DIRKMJK/get_kvk.py

## get_kvk.py
"""Download open data from the Dutch Company Register, unzip and store as csv.
Not guaranteed to yield complete and accurate data. For background see:
https://dirkmjk.nl/en/187/open-company-data-in-the-netherlands
"""

from pathlib import Path
import io
import zipfile
from zipfile import BadZipFile
import xml.etree.ElementTree as ET
import requests
import pandas as pd


# Zip archive containing the company register; downloaded by get_register().
URL_REGISTER = 'https://static.kvk.nl/download/kvk-open-data-set-handelsregister.zip'
# URL template for the annual-report archives; get_annual_reports() fills the
# placeholder with a counter that starts at 0.
BASE_URL_REPORTS = 'https://static.kvk.nl/download/kvk-open-data-set-jaarrekeningen{}.zip'
# csv that get_register() reads after extraction and rewrites as UTF-8.
# Relative paths below resolve against the current working directory.
PATH_REGISTER = '../data/kvk-open-data-set-handelsregister.csv'
# Parent directory; each report archive is unzipped into a subdirectory named
# after its counter.
DIR_ANNUAL_REPORTS = Path('../data/annual_reports')


def get_register(timeout=60):
    """Download the company register, unzip it and re-encode the csv as UTF-8.

    The archive at URL_REGISTER is extracted into ``../data/``. The csv at
    PATH_REGISTER is then read as latin-1 and written back to the same path
    as UTF-8. The DataFrame index is written along with the data (pandas
    default), so the rewritten file gains an unnamed first column.

    Args:
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        The register as a pandas DataFrame.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        zipfile.BadZipFile: If the downloaded content is not a zip archive.
    """
    r = requests.get(URL_REGISTER, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to unzip an error page.
    r.raise_for_status()
    with zipfile.ZipFile(io.BytesIO(r.content)) as z:
        z.extractall('../data/')
    register = pd.read_csv(PATH_REGISTER, encoding='latin-1')
    register.to_csv(PATH_REGISTER, encoding='utf8')
    return register


def get_annual_reports(timeout=60):
    """Download and unzip the numbered annual-report archives.

    Archives are requested from BASE_URL_REPORTS with a counter starting at
    0, and each one is extracted into DIR_ANNUAL_REPORTS/<counter>. The loop
    ends at the first response that is not OK or is not a valid zip archive.

    Args:
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
    """
    i = 0
    while True:
        url = BASE_URL_REPORTS.format(i)
        r = requests.get(url, timeout=timeout)
        if not r.ok:
            # No archive under this number: we are past the last one.
            break
        try:
            with zipfile.ZipFile(io.BytesIO(r.content)) as z:
                dir_unzip = DIR_ANNUAL_REPORTS / str(i)
                print(dir_unzip)
                # parents=True: DIR_ANNUAL_REPORTS itself may not exist yet.
                dir_unzip.mkdir(parents=True, exist_ok=True)
                z.extractall(dir_unzip)
        except BadZipFile:
            break
        i += 1


def parse_recur(element, report):
    """Collect key/value attribute pairs from an element tree into a dict.

    Visits ``element`` and all of its descendants in document order. On each
    node, an attribute whose name ends in 'key' supplies the dict key and one
    whose name ends in 'value' supplies the value. The pair is stored only
    when both are non-empty. ``report`` is updated in place and returned.
    """
    for node in element.iter():
        label = content = None
        for attr_name, attr_text in node.attrib.items():
            if attr_name.endswith('key'):
                label = attr_text
            elif attr_name.endswith('value'):
                content = attr_text
        if label and content:
            report[label] = content
    return report


def process_ar(path):
    """Parse one annual-report XML file into a flat dict.

    The result holds the file's stem under 'filename'. Each child of the XML
    root is handed to parse_recur, which adds the key/value attribute pairs
    it finds.
    """
    xml_root = ET.parse(path).getroot()
    report = {'filename': path.stem}
    for top_level in xml_root:
        report = parse_recur(top_level, report)
    return report


if __name__ == '__main__':
    # Optionally refresh the local data before parsing.
    download = input('download files (y/n) ')
    # Strip so that stray whitespace around the answer (e.g. 'y ') still counts.
    if download.strip().lower() == 'y':
        print('downloading and unzipping register')
        get_register()
        print('downloading and unzipping annual reports')
        get_annual_reports()
    print('parsing xml files')
    # One csv per subdirectory of DIR_ANNUAL_REPORTS. Directories and files
    # are sorted so that output order is reproducible between runs.
    for subdir in sorted(DIR_ANNUAL_REPORTS.glob('*')):
        if not subdir.is_dir():
            continue
        print(subdir)
        reports_list = [
            process_ar(path)
            for path
            in sorted(subdir.glob('*.xml'))
        ]
        reports_df = pd.DataFrame(reports_list)
        path_df = Path('../data/') / 'annual_reports_{}.csv'.format(subdir.stem)
        reports_df.to_csv(path_df, index=False)
	"""Download open data from the Dutch Company Register, unzip and store as csv.
	Not guaranteed to yield complete and accurate data. For background see:
	https://dirkmjk.nl/en/187/open-company-data-in-the-netherlands
	"""

	from pathlib import Path
	import io
	import zipfile
	from zipfile import BadZipFile
	import xml.etree.ElementTree as ET
	import requests
	import pandas as pd


	# Zip archive containing the company register; downloaded by get_register().
	URL_REGISTER = 'https://static.kvk.nl/download/kvk-open-data-set-handelsregister.zip'
	# URL template for the annual-report archives; get_annual_reports() fills the
	# placeholder with a counter that starts at 0.
	BASE_URL_REPORTS = 'https://static.kvk.nl/download/kvk-open-data-set-jaarrekeningen{}.zip'
	# csv that get_register() reads after extraction and rewrites as UTF-8.
	PATH_REGISTER = '../data/kvk-open-data-set-handelsregister.csv'
	# Parent directory; each report archive is unzipped into a subdirectory named
	# after its counter.
	DIR_ANNUAL_REPORTS = Path('../data/annual_reports')


	def get_register(timeout=60):
	    """Download the company register, unzip it and re-encode the csv as UTF-8.

	    The archive at URL_REGISTER is extracted into ``../data/``. The csv at
	    PATH_REGISTER is then read as latin-1 and written back to the same path
	    as UTF-8. The DataFrame index is written along with the data (pandas
	    default), so the rewritten file gains an unnamed first column.

	    Args:
	        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

	    Returns:
	        The register as a pandas DataFrame.

	    Raises:
	        requests.HTTPError: If the server answers with an error status.
	        zipfile.BadZipFile: If the downloaded content is not a zip archive.
	    """
	    r = requests.get(URL_REGISTER, timeout=timeout)
	    # Fail loudly on 4xx/5xx instead of trying to unzip an error page.
	    r.raise_for_status()
	    with zipfile.ZipFile(io.BytesIO(r.content)) as z:
	        z.extractall('../data/')
	    register = pd.read_csv(PATH_REGISTER, encoding='latin-1')
	    register.to_csv(PATH_REGISTER, encoding='utf8')
	    return register


	def get_annual_reports(timeout=60):
	    """Download and unzip the numbered annual-report archives.

	    Archives are requested from BASE_URL_REPORTS with a counter starting at
	    0, and each one is extracted into DIR_ANNUAL_REPORTS/<counter>. The loop
	    ends at the first response that is not OK or is not a valid zip archive.

	    Args:
	        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
	    """
	    i = 0
	    while True:
	        url = BASE_URL_REPORTS.format(i)
	        r = requests.get(url, timeout=timeout)
	        if not r.ok:
	            # No archive under this number: we are past the last one.
	            break
	        try:
	            with zipfile.ZipFile(io.BytesIO(r.content)) as z:
	                dir_unzip = DIR_ANNUAL_REPORTS / str(i)
	                print(dir_unzip)
	                # parents=True: DIR_ANNUAL_REPORTS itself may not exist yet.
	                dir_unzip.mkdir(parents=True, exist_ok=True)
	                z.extractall(dir_unzip)
	        except BadZipFile:
	            break
	        i += 1


	def parse_recur(element, report):
	    """Collect key/value attribute pairs from an element tree into a dict.

	    On ``element``, an attribute whose name ends in 'key' supplies the dict
	    key and one whose name ends in 'value' supplies the value. The pair is
	    stored only when both are non-empty. The same is then done recursively
	    for every child. ``report`` is updated in place and returned.
	    """
	    key = None
	    value = None
	    for attrib_key in element.attrib:
	        if attrib_key.endswith('key'):
	            key = element.attrib[attrib_key]
	        elif attrib_key.endswith('value'):
	            value = element.attrib[attrib_key]
	    if key and value:
	        report[key] = value
	    for child in element:
	        parse_recur(child, report)
	    return report


	def process_ar(path):
	    """Parse one annual-report XML file into a flat dict.

	    The result holds the file's stem under 'filename'. Each child of the XML
	    root is handed to parse_recur, which adds the key/value attribute pairs
	    it finds.
	    """
	    tree = ET.parse(path)
	    root = tree.getroot()
	    report = {'filename': path.stem}
	    for child in root:
	        report = parse_recur(child, report)
	    return report


	if __name__ == '__main__':
	    # Optionally refresh the local data before parsing.
	    download = input('download files (y/n) ')
	    # Strip so that stray whitespace around the answer (e.g. 'y ') still counts.
	    if download.strip().lower() == 'y':
	        print('downloading and unzipping register')
	        get_register()
	        print('downloading and unzipping annual reports')
	        get_annual_reports()
	    print('parsing xml files')
	    # One csv per subdirectory of DIR_ANNUAL_REPORTS. Directories and files
	    # are sorted so that output order is reproducible between runs.
	    for subdir in sorted(DIR_ANNUAL_REPORTS.glob('*')):
	        if not subdir.is_dir():
	            continue
	        print(subdir)
	        reports_list = [
	            process_ar(path)
	            for path
	            in sorted(subdir.glob('*.xml'))
	        ]
	        reports_df = pd.DataFrame(reports_list)
	        path_df = Path('../data/') / 'annual_reports_{}.csv'.format(subdir.stem)
	        reports_df.to_csv(path_df, index=False)