Skip to content

Instantly share code, notes, and snippets.

@fnielsen
Last active August 29, 2015 14:08
Show Gist options
  • Save fnielsen/56f2ed7c1f35d72762d1 to your computer and use it in GitHub Desktop.
Save fnielsen/56f2ed7c1f35d72762d1 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": "",
"signature": "sha256:fe2e9b256bf43cc51353e9b5c421d862646da62e5ff23279534976ad5686a7ef"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Analysis of Danish financial statements\n",
"===================================\n",
"\n",
"- Data from 'Erhvervsstyrelsen'.\n",
"- http://datahub.virk.dk/dataset/regnskabsdata-fra-selskaber-sample\n",
"- Sample of some hundreds of companies\n",
"- Data in zipped XBRL XML format\n",
"- http://datahub.virk.dk/sites/default/files/storage/1000_digitale_aarsrapporter.zip"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from lxml import etree\n",
"import os\n",
"import pandas as pd"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Directory holding the unzipped XBRL files. Defaults to the location the\n",
"# data was originally unzipped to; set the VIRKDK_DATA_DIR environment\n",
"# variable to point the notebook at another copy without editing this cell.\n",
"dirname = os.environ.get(\n",
"    'VIRKDK_DATA_DIR',\n",
"    '/home/fnielsen/data/virkdk/1000_digitale_aarsrapporter')"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Quick-and-dirty reading of the data: one flat dict per XML file.\n",
"# Each dict maps the local name of a '{http://xbrl.dcca.dk/...}' element\n",
"# to its value: a float when the text parses as a number, the raw text\n",
"# otherwise, and None for an element without text.\n",
"# NOTE: a tag that occurs several times in one file (for instance under\n",
"# different contextRef values) keeps only the value seen last.\n",
"rows = []\n",
"for filename in os.listdir(dirname):\n",
"    if not filename.endswith('.xml'):\n",
"        continue\n",
"    try:\n",
"        # Hand lxml the path instead of an open file object so that no\n",
"        # file handle is left unclosed.\n",
"        tree = etree.parse(os.path.join(dirname, filename))\n",
"    except (etree.LxmlError, IOError):\n",
"        # Some files fail to parse (apparently because of stray spaces);\n",
"        # skip them rather than abort the whole run. Only parse/read\n",
"        # errors are ignored, so other failures still surface.\n",
"        continue\n",
"    d = {}\n",
"    # With lxml, findall('/') is expected to yield the children of the\n",
"    # root element - TODO confirm if the parser library is ever swapped.\n",
"    for element in tree.findall('/'):\n",
"        # Comments have no string tag, so they must be skipped first.\n",
"        if isinstance(element, etree._Comment):\n",
"            continue\n",
"        if '{http://xbrl.dcca.dk/' not in element.tag:\n",
"            continue\n",
"        # Strip the '{namespace}' prefix to get the bare tag name.\n",
"        tag = element.tag[element.tag.index('}') + 1:]\n",
"        if element.text is None:\n",
"            value = None\n",
"        else:\n",
"            try:\n",
"                value = float(element.text)\n",
"            except ValueError:\n",
"                # Not numeric (e.g. a name): keep the raw text.\n",
"                value = element.text\n",
"        d[tag] = value\n",
"    rows.append(d)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Assemble the per-file dicts into one table: one row per parsed file,\n",
"# with the dict keys (tag names) as columns.\n",
"df = pd.DataFrame(data=rows)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# The twelve most frequent chairmen of the general meeting\n",
"df['NameAndSurnameOfChairmanOfGeneralMeeting'].value_counts().head(12)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 5,
"text": [
"Nils B. Bonde 7\n",
"Michael Vinther 7\n",
"Jens Tange M\u00f8llmann 5\n",
"Kim Larsen 5\n",
"Henrik Klougart 4\n",
"Uffe Martin Jensen 4\n",
"Henrik Rasmussen 4\n",
"Merete Lundbye M\u00f8ller 4\n",
"Johannes Nielsen 4\n",
"Poul-Erik Vind 4\n",
"S\u00f8ren Bruun Rasmussen 3\n",
"Thomas Folmann 3\n",
"dtype: int64"
]
}
],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# The twelve most frequent audit firms\n",
"df['NameOfAuditFirm'].value_counts().head(12)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 6,
"text": [
"Deloitte Statsautoriseret Revisionspartnerselskab 75\n",
"PricewaterhouseCoopers Statsautoriseret Revisionspartnerselskab 63\n",
"Beierholm 52\n",
"BDO Statsautoriseret revisionsaktieselskab 52\n",
"KPMG 26\n",
"AP | Statsautoriserede Revisorer P/S, 17\n",
"Redmark, Statsautoriseret Revisionspartnerselskab 15\n",
"KPMG Statsautoriseret Revisionspartnerselskab 13\n",
"GLB REVISION, Statsautoriserede Revisorer A/S 12\n",
"CHRISTENSEN KJ\u00c6RULFF, statsautoriseret revisionsaktieselskab 12\n",
"Partner Revision, statsautoriseret revisionsaktieselskab 11\n",
" 8\n",
"dtype: int64"
]
}
],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 7
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment