Download open data from the Dutch Company Register, unzip and store as csv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Download open data from the Dutch Company Register, unzip and store as csv. | |
Not guaranteed to yield complete and accurate data. For background see: | |
https://dirkmjk.nl/en/187/open-company-data-in-the-netherlands | |
""" | |
from pathlib import Path | |
import io | |
import zipfile | |
from zipfile import BadZipFile | |
import xml.etree.ElementTree as ET | |
import requests | |
import pandas as pd | |
# Zip archive with the company register (handelsregister) open data set.
URL_REGISTER = 'https://static.kvk.nl/download/kvk-open-data-set-handelsregister.zip'
# URL template for the annual report (jaarrekeningen) archives; `{}` is filled
# with a running number that starts at 0.
BASE_URL_REPORTS = 'https://static.kvk.nl/download/kvk-open-data-set-jaarrekeningen{}.zip'
# Path the register csv is read from and rewritten to after extraction.
PATH_REGISTER = '../data/kvk-open-data-set-handelsregister.csv'
# Directory that receives one numbered sub-directory per annual report archive.
DIR_ANNUAL_REPORTS = Path('../data/annual_reports')
def get_register():
    """Download the company register, unzip it and re-encode the csv as UTF-8.

    The archive at ``URL_REGISTER`` is extracted into the directory that
    holds ``PATH_REGISTER``. The csv at ``PATH_REGISTER`` is then read as
    latin-1 and written back to the same path as UTF-8.

    Returns:
        The register as a ``pandas.DataFrame``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        zipfile.BadZipFile: If the response body is not a zip archive.
    """
    response = requests.get(URL_REGISTER, timeout=60)
    # Surface the HTTP error itself instead of a misleading BadZipFile
    # raised while trying to open an error page as an archive.
    response.raise_for_status()

    # Extract next to PATH_REGISTER so the csv is found where it is read.
    data_dir = Path(PATH_REGISTER).parent
    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        archive.extractall(data_dir)

    register = pd.read_csv(PATH_REGISTER, encoding='latin-1')
    # NOTE(review): the DataFrame index is written as an extra unnamed first
    # column; pass index=False here if that is not intended.
    register.to_csv(PATH_REGISTER, encoding='utf8')
    return register
def get_annual_reports():
    """Download and unzip the numbered annual report archives.

    Archives are requested from ``BASE_URL_REPORTS`` with a running number
    that starts at 0. Each archive is extracted into
    ``DIR_ANNUAL_REPORTS/<number>``, and that directory is printed as a
    progress indicator. The loop ends at the first number for which the
    server does not deliver a valid zip archive.
    """
    i = 0
    while True:
        response = requests.get(BASE_URL_REPORTS.format(i), timeout=60)
        if not response.ok:
            # No archive with this number: the series has ended.
            break
        try:
            with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
                dir_unzip = DIR_ANNUAL_REPORTS / str(i)
                print(dir_unzip)
                # parents=True: the annual_reports directory itself may not
                # exist yet on a first run.
                dir_unzip.mkdir(parents=True, exist_ok=True)
                archive.extractall(dir_unzip)
        except BadZipFile:
            # A body that is not a zip archive also marks the end of the series.
            break
        i += 1
def parse_recur(element, report):
    """Collect key/value attribute pairs of an element tree into a dict.

    ``element`` and all of its descendants are visited in document order.
    On each element, the attribute whose name ends in ``key`` supplies the
    dict key and the attribute whose name ends in ``value`` supplies the
    value. Elements where either is missing or empty add nothing. A later
    entry overwrites an earlier one with the same key.

    Args:
        element: The ``xml.etree.ElementTree.Element`` to start from.
        report: Dict that is updated in place.

    Returns:
        The same ``report`` dict that was passed in.
    """
    # An explicit stack replaces recursion, so the nesting depth of the
    # document is not limited by the interpreter's recursion limit.
    pending = [element]
    while pending:
        node = pending.pop()
        key = None
        value = None
        for attrib_name, attrib_value in node.attrib.items():
            if attrib_name.endswith('key'):
                key = attrib_value
            elif attrib_name.endswith('value'):
                value = attrib_value
        if key and value:
            report[key] = value
        # Push children reversed so they are popped in document order.
        pending.extend(reversed(list(node)))
    return report
def process_ar(path):
    """Parse one annual report XML file into a flat dict.

    Args:
        path: Location of the XML file, as ``str`` or ``pathlib.Path``.

    Returns:
        Dict with ``'filename'`` set to the file name without its suffix,
        plus the entries ``parse_recur`` collects from every child of the
        root element. Attributes on the root element itself are not
        inspected.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    # Normalise first so plain string paths work with ``.stem`` too.
    path = Path(path)
    root = ET.parse(path).getroot()
    report = {'filename': path.stem}
    for child in root:
        # parse_recur fills ``report`` in place.
        parse_recur(child, report)
    return report
if __name__ == '__main__':
    # Downloading is optional so that already fetched files can be re-parsed.
    download = input('download files (y/n) ')
    if download.strip().lower() == 'y':
        print('downloading and unzipping register')
        get_register()
        print('downloading and unzipping annual reports')
        get_annual_reports()

    print('parsing xml files')
    # Sorted iteration keeps row order stable between runs.
    for subdir in sorted(DIR_ANNUAL_REPORTS.glob('*')):
        if not subdir.is_dir():
            continue
        print(subdir)
        reports_list = [
            process_ar(path)
            for path in sorted(subdir.glob('*.xml'))
        ]
        reports_df = pd.DataFrame(reports_list)
        # One csv per archive directory; ``name`` keeps the full directory
        # name even if it contains a dot.
        path_df = Path('../data/') / 'annual_reports_{}.csv'.format(subdir.name)
        reports_df.to_csv(path_df, index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment