Skip to content

Instantly share code, notes, and snippets.

@DIRKMJK
Created May 23, 2021 07:36
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save DIRKMJK/4944cd7ddd00f323d8f29fb4e60739d6 to your computer and use it in GitHub Desktop.
Save DIRKMJK/4944cd7ddd00f323d8f29fb4e60739d6 to your computer and use it in GitHub Desktop.
Download open data from the Dutch Company Register, unzip and store as csv
"""Download open data from the Dutch Company Register, unzip and store as csv.
Not guaranteed to yield complete and accurate data. For background see:
https://dirkmjk.nl/en/187/open-company-data-in-the-netherlands
"""
from pathlib import Path
import io
import zipfile
from zipfile import BadZipFile
import xml.etree.ElementTree as ET
import requests
import pandas as pd
# Zip archive that get_register() downloads.
URL_REGISTER = 'https://static.kvk.nl/download/kvk-open-data-set-handelsregister.zip'
# URL template for the annual-report archives; get_annual_reports() fills the
# `{}` placeholder with a counter that starts at 0.
BASE_URL_REPORTS = 'https://static.kvk.nl/download/kvk-open-data-set-jaarrekeningen{}.zip'
# Csv that get_register() reads after unzipping and then overwrites as UTF-8.
# Relative path: it is resolved against the current working directory.
PATH_REGISTER = '../data/kvk-open-data-set-handelsregister.csv'
# Parent folder for the numbered sub-folders (0, 1, ...) into which the
# annual-report archives are extracted. Also relative to the working directory.
DIR_ANNUAL_REPORTS = Path('../data/annual_reports')
def get_register():
    """Download the company register, unzip it and rewrite the csv as UTF-8.

    The archive at URL_REGISTER is fetched into memory and extracted into
    '../data/'. The csv at PATH_REGISTER is then read as latin-1 and written
    back to the same path as UTF-8.

    Returns:
        pandas.DataFrame: the register as read from the extracted csv.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        zipfile.BadZipFile: if the response body is not a zip archive.
    """
    # Without a timeout a stalled connection would block the script forever.
    response = requests.get(URL_REGISTER, timeout=60)
    # Fail with the real HTTP error instead of trying to unzip an error page.
    response.raise_for_status()
    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        archive.extractall('../data/')
    register = pd.read_csv(PATH_REGISTER, encoding='latin-1')
    # NOTE(review): to_csv keeps its default index=True, so the rewritten file
    # gets an extra leading index column; kept as-is to not change the output.
    register.to_csv(PATH_REGISTER, encoding='utf8')
    return register
def get_annual_reports():
    """Download and unzip the numbered annual-report archives.

    Archives are requested one by one as BASE_URL_REPORTS.format(0),
    BASE_URL_REPORTS.format(1), ... and each one is extracted into
    DIR_ANNUAL_REPORTS/<number>. The loop ends at the first response that is
    unsuccessful or whose body cannot be opened as a zip archive.

    Raises:
        zipfile.BadZipFile: if an archive opens fine but turns out to be
            corrupt while it is being extracted.
    """
    i = 0
    while True:
        url = BASE_URL_REPORTS.format(i)
        # Without a timeout a stalled connection would block the script forever.
        r = requests.get(url, timeout=60)
        if not r.ok:
            # An error status means there is no archive with this number.
            break
        # Only opening the archive is guarded: a body that is not a zip marks
        # the end of the series, while a failure during extraction is a real
        # error and must not be mistaken for it.
        try:
            archive = zipfile.ZipFile(io.BytesIO(r.content))
        except BadZipFile:
            break
        dir_unzip = DIR_ANNUAL_REPORTS / str(i)
        print(dir_unzip)
        # parents=True: DIR_ANNUAL_REPORTS itself may not exist on a first run.
        dir_unzip.mkdir(parents=True, exist_ok=True)
        with archive:
            archive.extractall(dir_unzip)
        i += 1
def parse_recur(element, report):
    """Collect key/value attribute pairs from an XML subtree into a dict.

    `element` and all of its descendants are visited in document order. On
    each node, the attribute whose name ends in 'key' supplies a dictionary
    key and the attribute whose name ends in 'value' supplies its value
    (the last matching attribute wins for each). A pair is stored only when
    both are non-empty; a key seen again later overwrites the earlier value.

    Args:
        element: the xml.etree.ElementTree element to start from.
        report: dict that is updated in place.

    Returns:
        The same `report` dict that was passed in.
    """
    # Element.iter() yields the element itself first, then its descendants
    # depth-first, i.e. the order in which a recursive descent visits them.
    for node in element.iter():
        found_key = found_value = None
        for attr_name, attr_text in node.attrib.items():
            if attr_name.endswith('key'):
                found_key = attr_text
            elif attr_name.endswith('value'):
                found_value = attr_text
        if not (found_key and found_value):
            continue
        report[found_key] = found_value
    return report
def process_ar(path):
    """Turn one annual-report XML file into a flat dict.

    Args:
        path: pathlib.Path of the XML file; its stem is stored under
            'filename'.

    Returns:
        dict with 'filename' plus every key/value pair that parse_recur
        finds below the root element. Attributes on the root element itself
        are not examined, because the walk starts at the root's children.
    """
    root = ET.parse(path).getroot()
    report = {'filename': path.stem}
    for section in root:
        report = parse_recur(section, report)
    return report
if __name__ == '__main__':
    # Optionally refresh the local copies, then (re)build one csv per
    # sub-folder of DIR_ANNUAL_REPORTS from whatever XML files are on disk.
    download = input('download files (y/n) ')
    # strip() so that an answer such as "y " still counts as yes.
    if download.strip().lower() == 'y':
        print('downloading and unzipping register')
        get_register()
        print('downloading and unzipping annual reports')
        get_annual_reports()
    print('parsing xml files')
    for subdir in DIR_ANNUAL_REPORTS.glob('*'):
        # Only the numbered sub-folders hold reports; skip stray files.
        if not subdir.is_dir():
            continue
        print(subdir)
        reports_list = [process_ar(path) for path in subdir.glob('*.xml')]
        reports_df = pd.DataFrame(reports_list)
        path_df = Path('../data/') / 'annual_reports_{}.csv'.format(subdir.stem)
        reports_df.to_csv(path_df, index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment