Scrape verkeersbesluiten (traffic decisions)

Background here

To run the code, create a project directory with subdirectories script and data; in the data subdirectory, create subdirectories html and processed. Store the Python files in the script subdirectory and run them from there.
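
If you prefer, the layout can also be created from Python; a minimal sketch, to be run once from the project directory:

from pathlib import Path

# Create the directory layout the scripts expect
for subdir in ['script', 'data/html', 'data/processed']:
    Path(subdir).mkdir(parents=True, exist_ok=True)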

Further note that the START_URL in download.py determines what content will be downloaded. You may want to inspect the page in a browser before running the script, to make sure you're getting the results you're after. In the browser you can apply filters to your results and then copy the url into download.py. Note, however, that filters use tags that may not be present in old traffic decisions; a safer approach may therefore be to simply use a search term.
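
For example, the search-term START_URL in download.py below can be parameterised; a minimal sketch, assuming the site's query syntax matches the example urls and that SEARCH_TERM needs no further url-encoding:

SEARCH_TERM = '30km'
START_URL = ('https://zoek.officielebekendmakingen.nl/resultaten'
             '?q=(c.product-area==%22officielepublicaties%22)'
             'and((w.publicatienaam==%22Staatscourant%22))'
             'and(cql.textAndIndexes=%22{}%22)'
             '&col=Staatscourant').format(SEARCH_TERM)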

"""Download traffic decisions from officielebekendmakingen.nl"""
from pathlib import Path
import requests
from bs4 import BeautifulSoup as bs
BASE_URL = 'https://zoek.officielebekendmakingen.nl'
START_URL = 'https://zoek.officielebekendmakingen.nl/resultaten?q=(c.product-area==%22officielepublicaties%22)and(dt.available%253e=%222016-01-01%22)and((w.publicatienaam==%22Staatscourant%22))%20AND%20w.verkeersbordcode==%22A1%22&zv=&pg=10&col=Staatscourant&svel=Publicatiedatum&svol=Aflopend&sf=vb|A1'
START_URL = 'https://zoek.officielebekendmakingen.nl/resultaten?q=(c.product-area==%22officielepublicaties%22)and((w.publicatienaam==%22Staatscourant%22))and((cql.textAndIndexes=%2230+km%22+or+cql.textAndIndexes=%2230km%22))&zv=%252230+km%2522+OR++%252230km%2522&col=Staatscourant'
DIR_HTML = Path('../data/html')
def extract_page_urls(soup):
    """Extract urls of web pages containing a decision"""
    links = soup.find_all('a', {'id': 'publicatieHyperLink'})
    return [BASE_URL + l.get('href') for l in links]


def extract_next_link(soup):
    """Extract link to next page with results"""
    next_link = soup.find('a', {'aria-label': 'Next'})
    try:
        return BASE_URL + next_link.get('href')
    except (AttributeError, TypeError):
        # No 'Next' anchor on the last results page
        return None
def collect_all_page_urls():
    """Go through all result pages to extract page urls"""
    next_link = START_URL
    page_urls = []
    while next_link:
        html = requests.get(next_link).text
        soup = bs(html, 'lxml')
        page_urls.extend(extract_page_urls(soup))
        next_link = extract_next_link(soup)
    return page_urls


def download(page_urls):
    """Download and save all html pages from page urls"""
    for url in page_urls:
        name = url.split('/')[-1]
        path = DIR_HTML / name
        html = requests.get(url).text
        path.write_text(html)
if __name__ == '__main__':
    downloaded = [p.name for p in DIR_HTML.glob('*.html')]
    print('collecting page urls')
    page_urls = collect_all_page_urls()
    # Skip pages that have already been downloaded
    page_urls = [u for u in page_urls if u.split('/')[-1] not in downloaded]
    print('downloading {} pages'.format(len(page_urls)))
    download(page_urls)
"""Process traffic decisions"""
from pathlib import Path
import re
from bs4 import BeautifulSoup as bs
import pandas as pd
PATTERN_KM_SIGN = r'A\s?0?1[\-]?([0-9]{2})'
PATTERN_KM_ZONE = r'A\s?0?1\s?\(zone ([0-9]{2})\)'
PATTERN_KM = r'[^0-9]([0-9]{2,3})\s?k[mi]'
PATTERN_SIGN = r'[Aa]\s?\.?0?1'
DIR_HTML = Path('../data/html')
TERMS_MAX_SPEED = [
'zone',
'zonale',
'maximum snelheid',
'maximumsnelheid',
'snelheidslimiet',
'maximum toegestane snelheid'
]
INTRODUCED = [
'geplaatst',
'plaatsen',
'plaatsing',
'vaststellen',
'instellen',
'uitbreiden',
'voorzien',
'aanbrengen',
'in te stellen'
]
CANCELLED = [
'verwijder',
'intrekken',
'opheffen'
]
UNDER_4M = 'tijdelijke verkeersmaatregel van kortere duur dan 4 maanden'
OVER_4M = 'tijdelijke verkeersmaatregel van langere duur dan 4 maanden'
def decision_starts(line):
    """Check if line announces start of decision"""
    if line.startswith('B E S L U I'):
        return True
    line = ''.join([c for c in line if c.isalpha()]).lower()
    for string in ['besluittot', 'besluitentot', 'besluitenwij']:
        if line.startswith(string):
            return True
    return line in ['besluit', 'besluiten', 'hetbesluit']


def process_line(line):
    """Extract speed decision from sentence"""
    speeds = re.findall(PATTERN_KM_SIGN, line)
    speeds.extend(re.findall(PATTERN_KM_ZONE, line))
    if re.findall(PATTERN_SIGN, line):
        speeds.extend(re.findall(PATTERN_KM, line))
    else:
        # Accept a bare speed only if the line also mentions a speed limit
        for term in TERMS_MAX_SPEED:
            if term in line.lower():
                speeds.extend(re.findall(PATTERN_KM, line))
                break
    if not speeds:
        return None
    introduced = any(term in line.lower() for term in INTRODUCED)
    cancelled = any(term in line.lower() for term in CANCELLED)
    return set(speeds), introduced, cancelled
def parse_html(html):
    """Extract relevant info from html"""
    soup = bs(html, 'lxml')
    lines = soup.text.split('\n')
    # Skip everything before the last line that announces the decision
    decision_start_idx = 0
    for i, line in enumerate(lines):
        if decision_starts(line):
            decision_start_idx = i
    lines = lines[decision_start_idx:]
    # Drop subordinate clauses starting with 'dat'
    lines = [l for l in lines if not l.lower().startswith('dat')]
    measures = [(l, process_line(l)) for l in lines]
    measures = [m for m in measures if m[1]]
    authority = soup.find('meta', {'name': 'OVERHEID.authority'})
    if authority:
        authority = authority.get('content')
    # Keep the last datetime on the page; None if there is no time element
    dt = None
    for t in soup.find_all('time'):
        dt = t.get('datetime')
    decision_types = soup.find_all('meta', {'name': 'Type verkeersbesluit'})
    if decision_types:
        decision_types = ';'.join([d.get('content') for d in decision_types])
    else:
        decision_types = ''
    coords = soup.find('meta', {'data-scheme': 'OVERHEID.EPSG28992'})
    if coords:
        coords = coords.get('content')
    return {
        'authority': authority,
        'datetime': dt,
        'decision_types': decision_types,
        'coords': coords,
        'measures': measures,
    }
def parse_dt(dt, to='month'):
    """Convert dt to month or year; assumes a 'D-M-YYYY' date"""
    if pd.isnull(dt):
        return None
    dt = dt.split(' ')[0]
    _, month, year = dt.split('-')
    if to == 'year':
        return year
    if len(month) == 1:
        month = '0{}'.format(month)
    return '{}-{}'.format(year, month)


def is_temporary(decision_type):
    """Check if decision is marked as temporary"""
    if not isinstance(decision_type, str):
        return 'not_temp'
    if UNDER_4M in decision_type.lower():
        return 'under_4m'
    if OVER_4M in decision_type.lower():
        return 'over_4m'
    return 'not_temp'


def count_measures(measures):
    """Count the number of measures, by type"""
    item = {}
    for _, (speeds, introduced, cancelled) in measures:
        for speed in speeds:
            if speed not in item:
                item[speed] = {'introduced': 0, 'cancelled': 0}
            if introduced:
                item[speed]['introduced'] += 1
            if cancelled:
                item[speed]['cancelled'] += 1
    return item
def create_df():
    """Create dataframe containing decisions"""
    decisions = []
    for path in DIR_HTML.glob('*.html'):
        html = path.read_text()
        decision = parse_html(html)
        measure_counts = count_measures(decision['measures'])
        for speed, value in measure_counts.items():
            decision['introduced_{}'.format(speed)] = value['introduced']
            decision['cancelled_{}'.format(speed)] = value['cancelled']
        decision['file_name'] = path.name
        decisions.append(decision)
    df = pd.DataFrame(decisions)
    df['month'] = df.datetime.apply(parse_dt)
    df['year'] = df.datetime.map(lambda x: parse_dt(x, 'year'))
    df['temp'] = df.decision_types.apply(is_temporary)
    return df


def main():
    df = create_df()
    df.to_csv('../data/processed/verkeersbesluiten.csv')


if __name__ == '__main__':
    main()
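
As a quick sanity check of the line parser, you can feed process_line a made-up decision sentence; the expected output follows from the regexes and term lists above:

# Made-up example: 'by placing signs A1 (zone 30)'
result = process_line('door het plaatsen van borden A1 (zone 30)')
print(result)  # ({'30'}, True, False): zone 30 introduced, not cancelled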