Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@EmilStenstrom
Created March 16, 2015 12:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save EmilStenstrom/79890eefccb92bd3cf30 to your computer and use it in GitHub Desktop.
Save EmilStenstrom/79890eefccb92bd3cf30 to your computer and use it in GitHub Desktop.
Script to scrape data from Socialstyrelsen's database of death statistics. You need to specify which codes to fetch via `codes` in main().
# -*- coding: utf-8 -*-
from lxml.html import fromstring
import os
import csv
import requests
def _get_payload_for_code(code):
    """Return the form fields to POST when requesting the table for ``code``.

    ``code`` is interpolated into one field name (``i_<code>_3``) and into the
    ``hvDIA`` value; every other field is a fixed string.

    NOTE(review): the meaning of the fixed fields is not visible in this
    script. Presumably ``hvOMR`` selects regions, ``vKON`` sexes, ``vAR`` the
    year and ``vAGI`` age groups -- confirm against the query form.
    """
    code_checkbox = "i_%s_3" % code
    code_filter = ";%s;" % code
    fields = [
        (code_checkbox, "on"),
        ("visaAG", "on"),
        ("hvDIA", code_filter),
        ("hvOMR", ";1;;3;;4;;5;;6;;7;;8;;9;;10;;12;;13;;14;;17;;18;;19;;20;;21;;22;;23;;24;;25;"),
        ("vKON", ";1;;2;"),
        ("vMATT", ";1;"),
        ("vAR", ";2013;"),
        ("vAGI", ";1;;2;;3;;4;;5;;6;;7;;8;;9;;10;;11;;12;;13;;14;;15;;16;;17;;18;"),
    ]
    return dict(fields)
def _create_file_from_scaped_data(filename, data):
    """Write scraped rows to ``filename`` as a semicolon-delimited UTF-8 CSV.

    An existing file is never overwritten: if ``filename`` already exists the
    function returns without doing anything. Missing parent directories are
    created before the file is opened.

    Args:
        filename: Path of the CSV file to create.
        data: Iterable of rows; each row is an iterable of cell values.

    Returns:
        None.
    """
    import sys  # local import: only needed for the interpreter switch below

    if os.path.isfile(filename):
        return

    # open() does not create intermediate directories, so a fresh checkout
    # without e.g. "data/codes" would otherwise fail on the first write.
    directory = os.path.dirname(filename)
    if directory and not os.path.isdir(directory):
        os.makedirs(directory)

    print("Creating %s" % filename)

    if sys.version_info[0] < 3:
        # Python 2: the csv module writes byte strings to a binary file, so
        # unicode cells must be encoded by hand.
        text_type = type(u"")
        with open(filename, "wb") as csvfile:
            writer = csv.writer(csvfile, delimiter=";")
            for row in data:
                writer.writerow([
                    cell.encode("utf-8") if isinstance(cell, text_type) else cell
                    for cell in row
                ])
    else:
        # Python 3: the csv module writes text; newline="" lets it control
        # line endings itself, and the file object handles the encoding.
        with open(filename, "w", encoding="utf-8", newline="") as csvfile:
            csv.writer(csvfile, delimiter=";").writerows(data)
def _data_from_html(html):
    """Extract the first HTML table in ``html`` as a list of rows.

    The first row of the result holds the header texts, taken from the
    ``th font a font`` elements. Each later row holds the ``td font`` texts
    of one ``tr`` (the first ``tr`` is skipped), with the placeholder
    ``"--"`` replaced by ``"0"``.

    Returns:
        A list of lists of cell texts, or ``None`` when the document contains
        no ``table`` element.
    """
    document = fromstring(html)
    found_tables = document.cssselect("table")
    if len(found_tables) == 0:
        return None

    first_table = found_tables[0]
    header = [node.text for node in first_table.cssselect("th font a font")]
    result = [header]

    # The first <tr> is skipped; the header was collected separately above.
    for table_row in first_table.cssselect("tr")[1:]:
        cells = []
        for node in table_row.cssselect("td font"):
            cells.append("0" if node.text == "--" else node.text)
        result.append(cells)
    return result
def main(codes=None):
    """Fetch the result table for each code and save it as a CSV file.

    For every code the payload from ``_get_payload_for_code`` is POSTed to
    the results page, the response is parsed with ``_data_from_html`` and,
    if any data came back, written to ``data/codes/<code>.csv``.

    Args:
        codes: Iterable of code strings to fetch. Defaults to
            ``["C00", "C01"]`` when omitted.

    Returns:
        None. The function stops at the first response whose status code
        is not 200.
    """
    url = "http://192.137.163.49/sdb/if_dor/resultat.aspx"
    # Without a timeout requests waits forever on an unresponsive server.
    timeout_seconds = 60
    if codes is None:
        codes = ["C00", "C01"]

    for code in codes:
        filename = os.path.join("data", "codes", code + ".csv")
        if os.path.isfile(filename):
            # Existing files are never overwritten, so fetching this code
            # again would only put needless load on the server.
            continue

        payload = _get_payload_for_code(code)
        print("Fetching code %s" % code)
        response = requests.post(url, data=payload, timeout=timeout_seconds)
        if response.status_code != 200:
            print("Error getting code %s" % code)
            return

        data = _data_from_html(response.content)
        if data:
            _create_file_from_scaped_data(filename, data)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
@HenrikPassmark
Copy link

For reference.
"Assuming the HTML structure of the table is similar between different tables, it might be possible to re-purpose this script for subjects other than cause of death statistics. Headlines are stored in TH tags, and data in TR TD tags. "

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment