Skip to content

Instantly share code, notes, and snippets.

@afternoon
Created October 2, 2011 22:22
Show Gist options
  • Save afternoon/1258046 to your computer and use it in GitHub Desktop.
Save afternoon/1258046 to your computer and use it in GitHub Desktop.
Scrape disease data from RKI SurvStat
#!/usr/bin/python
# vim: set fileencoding=utf-8
from csv import writer
from datetime import datetime
from selenium import webdriver
# (county_id, county_name) pairs; the scraping loop further down iterates this
# list and issues its queries once per entry. Per the data() docstring the ID
# is the county's option value in the SurvStat query form (e.g. 5334 for
# 'SR Aachen').
# NOTE(review): the LK/SK/SR name prefixes presumably stand for Landkreis
# (rural district), Stadtkreis (urban district) and Staedteregion -- confirm
# against the form.
COUNTIES = [
(5334, 'SR Aachen'),
(7131, 'LK Ahrweiler'),
(9771, 'LK Aichach-Friedberg'),
(8425, 'LK Alb-Donau-Kreis'),
(16077, 'LK Altenburger Land'),
(7132, 'LK Altenkirchen'),
(15081, 'LK Altmarkkreis Salzwedel'),
(9171, 'LK Altötting'),
(7331, 'LK Alzey-Worms'),
(9371, 'LK Amberg-Sulzbach'),
(3451, 'LK Ammerland'),
(15082, 'LK Anhalt-Bitterfeld'),
(9571, 'LK Ansbach'),
(9671, 'LK Aschaffenburg'),
(9772, 'LK Augsburg'),
(3452, 'LK Aurich'),
(13051, 'LK Bad Doberan'),
(7332, 'LK Bad Dürkheim'),
(9672, 'LK Bad Kissingen'),
(7133, 'LK Bad Kreuznach'),
(9173, 'LK Bad Tölz-Wolfratshausen'),
(9471, 'LK Bamberg'),
(12060, 'LK Barnim'),
(14625, 'LK Bautzen'),
(9472, 'LK Bayreuth'),
(9172, 'LK Berchtesgadener Land'),
(6431, 'LK Bergstraße'),
(7231, 'LK Bernkastel-Wittlich'),
(8426, 'LK Biberach'),
(7134, 'LK Birkenfeld'),
(7232, 'LK Bitburg-Prüm'),
(8115, 'LK Böblingen'),
(8435, 'LK Bodenseekreis'),
(15083, 'LK Börde'),
(5554, 'LK Borken'),
(8315, 'LK Breisgau-Hochschwarzwald'),
(15084, 'LK Burgenland'),
(8235, 'LK Calw'),
(3351, 'LK Celle'),
(9372, 'LK Cham'),
(3453, 'LK Cloppenburg'),
(9473, 'LK Coburg'),
(7135, 'LK Cochem-Zell'),
(5558, 'LK Coesfeld'),
(3352, 'LK Cuxhaven'),
(9174, 'LK Dachau'),
(12061, 'LK Dahme-Spreewald'),
(6432, 'LK Darmstadt-Dieburg'),
(9271, 'LK Deggendorf'),
(13052, 'LK Demmin'),
(3251, 'LK Diepholz'),
(9773, 'LK Dillingen a.d.Donau'),
(9279, 'LK Dingolfing-Landau'),
(1051, 'LK Dithmarschen'),
(9779, 'LK Donau-Ries'),
(7333, 'LK Donnersbergkreis'),
(5358, 'LK Düren'),
(9175, 'LK Ebersberg'),
(16061, 'LK Eichsfeld'),
(9176, 'LK Eichstätt'),
(12062, 'LK Elbe-Elster'),
(8316, 'LK Emmendingen'),
(3454, 'LK Emsland'),
(5954, 'LK Ennepe-Ruhr-Kreis'),
(8236, 'LK Enzkreis'),
(9177, 'LK Erding'),
(5362, 'LK Erftkreis'),
(9572, 'LK Erlangen-Höchstadt'),
(14521, 'LK Erzgebirgskreis'),
(8116, 'LK Esslingen'),
(5366, 'LK Euskirchen'),
(9474, 'LK Forchheim'),
(9178, 'LK Freising'),
(8237, 'LK Freudenstadt'),
(9272, 'LK Freyung-Grafenau'),
(3455, 'LK Friesland'),
(6631, 'LK Fulda'),
(9179, 'LK Fürstenfeldbruck'),
(9573, 'LK Fürth'),
(9180, 'LK Garmisch-Partenkirchen'),
(7334, 'LK Germersheim'),
(6531, 'LK Gießen'),
(3151, 'LK Gifhorn'),
(8117, 'LK Göppingen'),
(14626, 'LK Görlitz'),
(3153, 'LK Goslar'),
(16067, 'LK Gotha'),
(3152, 'LK Göttingen'),
(3456, 'LK Grafschaft Bentheim'),
(16076, 'LK Greiz'),
(6433, 'LK Groß-Gerau'),
(9774, 'LK Günzburg'),
(13053, 'LK Güstrow'),
(5754, 'LK Gütersloh'),
(3252, 'LK Hameln-Pyrmont'),
(3353, 'LK Harburg'),
(15085, 'LK Harz'),
(9674, 'LK Haßberge'),
(12063, 'LK Havelland'),
(8135, 'LK Heidenheim'),
(8125, 'LK Heilbronn'),
(5370, 'LK Heinsberg'),
(3154, 'LK Helmstedt'),
(5758, 'LK Herford'),
(6632, 'LK Hersfeld-Rotenburg'),
(1053, 'LK Herzogtum Lauenburg'),
(16069, 'LK Hildburghausen'),
(3254, 'LK Hildesheim'),
(5958, 'LK Hochsauerlandkreis'),
(6434, 'LK Hochtaunuskreis'),
(9475, 'LK Hof'),
(8126, 'LK Hohenlohekreis'),
(3255, 'LK Holzminden'),
(5762, 'LK Höxter'),
(16070, 'LK Ilm-Kreis'),
(15086, 'LK Jerichower Land'),
(7335, 'LK Kaiserslautern'),
(8215, 'LK Karlsruhe'),
(6633, 'LK Kassel'),
(9273, 'LK Kelheim'),
(9675, 'LK Kitzingen'),
(5154, 'LK Kleve'),
(8335, 'LK Konstanz'),
(9476, 'LK Kronach'),
(9477, 'LK Kulmbach'),
(7336, 'LK Kusel'),
(16065, 'LK Kyffhäuserkreis'),
(6532, 'LK Lahn-Dill-Kreis'),
(9181, 'LK Landsberg a.Lech'),
(9274, 'LK Landshut'),
(3457, 'LK Leer'),
(14729, 'LK Leipzig'),
(9478, 'LK Lichtenfels'),
(6533, 'LK Limburg-Weilburg'),
(9776, 'LK Lindau'),
(5766, 'LK Lippe'),
(8336, 'LK Lörrach'),
(3354, 'LK Lüchow-Dannenberg'),
(8118, 'LK Ludwigsburg'),
(13054, 'LK Ludwigslust'),
(3355, 'LK Lüneburg'),
(6435, 'LK Main-Kinzig-Kreis'),
(9677, 'LK Main-Spessart'),
(8128, 'LK Main-Tauber-Kreis'),
(6436, 'LK Main-Taunus-Kreis'),
(7339, 'LK Mainz-Bingen'),
(15087, 'LK Mansfelder-Südharz'),
(6534, 'LK Marburg-Biedenkopf'),
(12064, 'LK Märkisch-Oderland'),
(5962, 'LK Märkischer Kreis'),
(7137, 'LK Mayen-Koblenz'),
(13055, 'LK Mecklenburg-Strelitz'),
(14627, 'LK Meißen'),
(10042, 'LK Merzig-Wadern'),
(5158, 'LK Mettmann'),
(9182, 'LK Miesbach'),
(9676, 'LK Miltenberg'),
(5770, 'LK Minden-Lübbecke'),
(14522, 'LK Mittelsachsen'),
(9183, 'LK Mühldorf a.Inn'),
(9184, 'LK München'),
(13056, 'LK Müritz'),
(8225, 'LK Neckar-Odenwald-Kreis'),
(9775, 'LK Neu-Ulm'),
(9185, 'LK Neuburg-Schrobenhausen'),
(9373, 'LK Neumarkt i.d.OPf.'),
(10043, 'LK Neunkirchen'),
(5162, 'LK Neuss'),
(9374, 'LK Neustadt a.d.Waldnaab'),
(9575, 'LK Neustadt/Aisch-Bad Windsheim'),
(7138, 'LK Neuwied'),
(3256, 'LK Nienburg (Weser)'),
(1054, 'LK Nordfriesland'),
(16062, 'LK Nordhausen'),
(14730, 'LK Nordsachsen'),
(13057, 'LK Nordvorpommern'),
(13058, 'LK Nordwestmecklenburg'),
(3155, 'LK Northeim'),
(9574, 'LK Nürnberger Land'),
(9780, 'LK Oberallgäu'),
(5374, 'LK Oberbergischer Kreis'),
(12065, 'LK Oberhavel'),
(12066, 'LK Oberspreewald-Lausitz'),
(6437, 'LK Odenwaldkreis'),
(12067, 'LK Oder-Spree'),
(6438, 'LK Offenbach'),
(3458, 'LK Oldenburg'),
(5966, 'LK Olpe'),
(8317, 'LK Ortenaukreis'),
(3459, 'LK Osnabrück'),
(8136, 'LK Ostalbkreis'),
(9777, 'LK Ostallgäu'),
(3356, 'LK Osterholz'),
(3156, 'LK Osterode am Harz'),
(1055, 'LK Ostholstein'),
(12068, 'LK Ostprignitz-Ruppin'),
(13059, 'LK Ostvorpommern'),
(5774, 'LK Paderborn'),
(13060, 'LK Parchim'),
(9275, 'LK Passau'),
(3157, 'LK Peine'),
(9186, 'LK Pfaffenhofen a.d.Ilm'),
(1056, 'LK Pinneberg'),
(1057, 'LK Plön'),
(12069, 'LK Potsdam-Mittelmark'),
(12070, 'LK Prignitz'),
(8216, 'LK Rastatt'),
(8436, 'LK Ravensburg'),
(5562, 'LK Recklinghausen'),
(9276, 'LK Regen'),
(9375, 'LK Regensburg'),
(8119, 'LK Rems-Murr-Kreis'),
(1058, 'LK Rendsburg-Eckernförde'),
(8415, 'LK Reutlingen'),
(7140, 'LK Rhein-Hunsrück-Kreis'),
(7141, 'LK Rhein-Lahn-Kreis'),
(8226, 'LK Rhein-Neckar-Kreis'),
(7338, 'LK Rhein-Pfalz-Kreis'),
(5382, 'LK Rhein-Sieg-Kreis'),
(6439, 'LK Rheingau-Taunus-Kreis'),
(5378, 'LK Rheinisch-Bergischer Kreis'),
(9673, 'LK Rhön-Grabfeld'),
(9187, 'LK Rosenheim'),
(3357, 'LK Rotenburg (Wümme)'),
(9576, 'LK Roth'),
(9277, 'LK Rottal-Inn'),
(8325, 'LK Rottweil'),
(13061, 'LK Rügen'),
(16074, 'LK Saale-Holzland-Kreis'),
(15088, 'LK Saalekreis'),
(16075, 'LK Saale-Orla-Kreis'),
(16073, 'LK Saalfeld-Rudolstadt'),
(10045, 'LK Saar-Pfalz-Kreis'),
(10044, 'LK Saarlouis'),
(14628, 'LK Sächsische Schweiz-Osterzgebirge'),
(15089, 'LK Salzland'),
(10046, 'LK Sankt Wendel'),
(3257, 'LK Schaumburg'),
(1059, 'LK Schleswig-Flensburg'),
(16066, 'LK Schmalkalden-Meiningen'),
(8127, 'LK Schwäbisch Hall'),
(6634, 'LK Schwalm-Eder-Kreis'),
(9376, 'LK Schwandorf'),
(8326, 'LK Schwarzwald-Baar-Kreis'),
(9678, 'LK Schweinfurt'),
(1060, 'LK Segeberg'),
(5970, 'LK Siegen-Wittgenstein'),
(8437, 'LK Sigmaringen'),
(5974, 'LK Soest'),
(3358, 'Heidekreis'),
(16068, 'LK Sömmerda'),
(16072, 'LK Sonneberg'),
(12071, 'LK Spree-Neiße'),
(3359, 'LK Stade'),
(10041, 'LK Stadtverband Saarbrücken'),
(9188, 'LK Starnberg'),
(1061, 'LK Steinburg'),
(5566, 'LK Steinfurt'),
(15090, 'LK Stendal'),
(1062, 'LK Stormarn'),
(9278, 'LK Straubing-Bogen'),
(7337, 'LK Südliche Weinstraße'),
(7340, 'LK Südwestpfalz'),
(12072, 'LK Teltow-Fläming'),
(9377, 'LK Tirschenreuth'),
(9189, 'LK Traunstein'),
(7235, 'LK Trier-Saarburg'),
(8416, 'LK Tübingen'),
(8327, 'LK Tuttlingen'),
(12073, 'LK Uckermark'),
(13062, 'LK Uecker-Randow'),
(3360, 'LK Uelzen'),
(5978, 'LK Unna'),
(16064, 'LK Unstrut-Hainich-Kreis'),
(9778, 'LK Unterallgäu'),
(3460, 'LK Vechta'),
(3361, 'LK Verden'),
(5166, 'LK Viersen'),
(6535, 'LK Vogelsbergkreis'),
(14523, 'LK Vogtlandkreis'),
(7233, 'LK Vulkaneifel'),
(6635, 'LK Waldeck-Frankenberg'),
(8337, 'LK Waldshut'),
(5570, 'LK Warendorf'),
(16063, 'LK Wartburgkreis'),
(9190, 'LK Weilheim-Schongau'),
(16071, 'LK Weimarer Land'),
(9577, 'LK Weißenburg-Gunzenhausen'),
(6636, 'LK Werra-Meißner-Kreis'),
(5170, 'LK Wesel'),
(3461, 'LK Wesermarsch'),
(7143, 'LK Westerwaldkreis'),
(6440, 'LK Wetteraukreis'),
(15091, 'LK Wittenberg'),
(3462, 'LK Wittmund'),
(3158, 'LK Wolfenbüttel'),
(9479, 'LK Wunsiedel i.Fichtelgebirge'),
(9679, 'LK Würzburg'),
(8417, 'LK Zollernalbkreis'),
(14524, 'LK Zwickau'),
(3241, 'Region Hannover'),
(5313, 'SK Aachen'),
(9361, 'SK Amberg'),
(9561, 'SK Ansbach'),
(9661, 'SK Aschaffenburg'),
(9761, 'SK Augsburg'),
(8211, 'SK Baden-Baden'),
(9461, 'SK Bamberg'),
(9462, 'SK Bayreuth'),
(11001, 'SK Berlin Mitte'),
(11002, 'SK Berlin Friedrichshain-Kreuzberg'),
(11003, 'SK Berlin Pankow'),
(11004, 'SK Berlin Charlottenburg-Wilmersdorf'),
(11005, 'SK Berlin Spandau'),
(11006, 'SK Berlin Steglitz-Zehlendorf'),
(11007, 'SK Berlin Tempelhof-Schöneberg'),
(11008, 'SK Berlin Neukölln'),
(11009, 'SK Berlin Treptow-Köpenick'),
(11010, 'SK Berlin Marzahn-Hellersdorf'),
(11011, 'SK Berlin Lichtenberg'),
(11012, 'SK Berlin Reinickendorf'),
(11099, 'SK Berlin (Bezirk nicht zuordenbar)'),
(5711, 'SK Bielefeld'),
(5911, 'SK Bochum'),
(5314, 'SK Bonn'),
(5512, 'SK Bottrop'),
(12051, 'SK Brandenburg a.d.Havel'),
(3101, 'SK Braunschweig'),
(4011, 'SK Bremen'),
(4012, 'SK Bremerhaven'),
(14511, 'SK Chemnitz'),
(9463, 'SK Coburg'),
(12052, 'SK Cottbus'),
(6411, 'SK Darmstadt'),
(3401, 'SK Delmenhorst'),
(15001, 'SK Dessau-Roßlau'),
(5913, 'SK Dortmund'),
(14612, 'SK Dresden'),
(5112, 'SK Duisburg'),
(5111, 'SK Düsseldorf'),
(16056, 'SK Eisenach'),
(3402, 'SK Emden'),
(16051, 'SK Erfurt'),
(9562, 'SK Erlangen'),
(5113, 'SK Essen'),
(1001, 'SK Flensburg'),
(7311, 'SK Frankenthal'),
(12053, 'SK Frankfurt (Oder)'),
(6412, 'SK Frankfurt am Main'),
(8311, 'SK Freiburg i.Breisgau'),
(9563, 'SK Fürth'),
(5513, 'SK Gelsenkirchen'),
(16052, 'SK Gera'),
(13001, 'SK Greifswald'),
(5914, 'SK Hagen'),
(15002, 'SK Halle (Saale)'),
(2000, 'SK Hamburg'),
(5915, 'SK Hamm'),
(8221, 'SK Heidelberg'),
(8121, 'SK Heilbronn'),
(5916, 'SK Herne'),
(9464, 'SK Hof'),
(9161, 'SK Ingolstadt'),
(16053, 'SK Jena'),
(7312, 'SK Kaiserslautern'),
(8212, 'SK Karlsruhe'),
(6611, 'SK Kassel'),
(9762, 'SK Kaufbeuren'),
(9763, 'SK Kempten'),
(1002, 'SK Kiel'),
(7111, 'SK Koblenz'),
(5315, 'SK Köln'),
(5114, 'SK Krefeld'),
(7313, 'SK Landau i.d.Pfalz'),
(9261, 'SK Landshut'),
(14713, 'SK Leipzig'),
(5316, 'SK Leverkusen'),
(1003, 'SK Lübeck'),
(7314, 'SK Ludwigshafen'),
(15003, 'SK Magdeburg'),
(7315, 'SK Mainz'),
(8222, 'SK Mannheim'),
(9764, 'SK Memmingen'),
(5116, 'SK Mönchengladbach'),
(5117, 'SK Mülheim a.d.Ruhr'),
(9162, 'SK München'),
(5515, 'SK Münster'),
(13002, 'SK Neubrandenburg'),
(1004, 'SK Neumünster'),
(7316, 'SK Neustadt a.d.Weinstraße'),
(9564, 'SK Nürnberg'),
(5119, 'SK Oberhausen'),
(6413, 'SK Offenbach'),
(3403, 'SK Oldenburg'),
(3404, 'SK Osnabrück'),
(9262, 'SK Passau'),
(8231, 'SK Pforzheim'),
(7317, 'SK Pirmasens'),
(12054, 'SK Potsdam'),
(9362, 'SK Regensburg'),
(5120, 'SK Remscheid'),
(9163, 'SK Rosenheim'),
(13003, 'SK Rostock'),
(3102, 'SK Salzgitter'),
(9565, 'SK Schwabach'),
(9662, 'SK Schweinfurt'),
(13004, 'SK Schwerin'),
(5122, 'SK Solingen'),
(7318, 'SK Speyer'),
(13005, 'SK Stralsund'),
(9263, 'SK Straubing'),
(8111, 'SK Stuttgart'),
(16054, 'SK Suhl'),
(7211, 'SK Trier'),
(8421, 'SK Ulm'),
(9363, 'SK Weiden i.d.OPf.'),
(16055, 'SK Weimar'),
(6414, 'SK Wiesbaden'),
(3405, 'SK Wilhelmshaven'),
(13006, 'SK Wismar'),
(3103, 'SK Wolfsburg'),
(7319, 'SK Worms'),
(5124, 'SK Wuppertal'),
(9663, 'SK Würzburg'),
(7320, 'SK Zweibrücken'),
]
def _swap_option(driver, list_id, preselected, wanted):
    '''Click option *preselected*, then option *wanted*, in list box *list_id*.

    NOTE(review): the first click presumably deselects the entry the form
    preselects in a multi-select list box -- confirm against the live form.
    '''
    option = '#%s > option[value="%%s"]' % list_id
    driver.find_element_by_css_selector(option % preselected).click()
    driver.find_element_by_css_selector(option % wanted).click()


def _parse_result_table(text):
    '''Turn the whitespace-separated text of the result grid into nested dicts.

    The first line supplies the column keys (its first token is dropped), each
    following line becomes ``{first_token: {key: value, ...}}``. Tokens are
    converted with int_if_possible. Rows are split on whitespace, so a row
    label that itself contains a space is not handled.
    '''
    lines = text.split('\n')
    keys = [int_if_possible(k) for k in lines[0].split()[1:]]
    results = {}
    for line in lines[1:]:
        cells = line.split()
        if not cells:
            # blank line in the grid text: nothing to record, and indexing
            # cells[0] would raise IndexError
            continue
        values = [int_if_possible(c) for c in cells[1:]]
        results[cells[0]] = dict(zip(keys, values))
    return results


def data(driver, disease='---', year='--- Alle ---', county='0', gender='0',
         stratification='27', age_group='-1', crosstab="Woche"):
    '''Get RKI disease occurrence data for a given disease, year, county,
    gender and age group by driving the SurvStat query form.

    The defaults query all diseases, years, counties, sexes and age groups.
    Selector arguments may be given as strings or ints; they are compared and
    inserted into the form selectors in their string form.

    Arguments:
    driver -- a Selenium WebDriver used to load and operate the form
    disease -- option value of the disease list ('---' leaves the default)
    year -- option value of the year list
    county -- county ID number from the form, e.g. 5334 for SR Aachen
              ('0' leaves the default)
    gender -- 0: both, 1: male, 2: female
    stratification -- option value of the age-stratification drop-down
                      (the meaning of the default '27' is not visible here)
    age_group -- option value of the age-group list ('-1' leaves the default)
    crosstab -- suffix of the crosstab option value, selected as
                'query.<crosstab>'; the default 'Woche' is German for week

    Returns a dict mapping the first cell of every result row (e.g. 'FSME')
    to a dict of column header -> cell value, with integers where the text
    parses as one.

    The examples need a live browser and network access, so doctest skips
    them. NOTE(review): the sample output is carried over from the original
    docstring and shows yearly keys; with the default crosstab the columns
    are presumably weeks -- confirm.

    >>> driver = webdriver.Firefox()  # doctest: +SKIP
    >>> driver.implicitly_wait(10)  # doctest: +SKIP

    Get FSME data for men in LK Alb-Donau-Kreis (county ID 8425):

    >>> data(driver, 'FSV', county=8425, gender=1)  # doctest: +SKIP
    {'FSME': {2001: 0, 2002: 0, 2003: 0, 2004: 0, 2005: 0, 2006: 0, 2007: 0, 2008: 1, 2009: 0, 2010: 1, 2011: 0}}

    Get FSME data for all years, counties and sexes:

    >>> d = data(driver)  # doctest: +SKIP
    >>> d['FSME'][2001]  # doctest: +SKIP
    256
    >>> driver.quit()  # doctest: +SKIP

    Data sourced from http://www3.rki.de/survstat/QueryForm.aspx
    '''
    # progress line; a single parenthesised argument prints identically under
    # Python 2 and Python 3
    print('%s disease=%s year=%s county=%s gender=%s age_group=%s' %
          (datetime.isoformat(datetime.now(), ' '), disease, year, county,
           gender, age_group))

    # Callers pass ints as well as strings (e.g. county IDs); normalise so the
    # comparisons against the string defaults below behave the same for both.
    disease, year, county, gender, age_group = [
        '%s' % value for value in (disease, year, county, gender, age_group)]

    # load the query form
    driver.get('http://www3.rki.de/survstat/QueryForm.aspx')

    # set lang to English to ease debugging
    driver.find_element_by_css_selector('#imgbtnEnglish').click()

    # disease
    if disease != '---':
        _swap_option(
            driver, 'webTab__ctl0_ctrlDisease_Migrated_lbMeldekategorie',
            '---', disease)

    # year
    # NOTE(review): '2012' is hard-coded as the entry to click away first;
    # presumably it was the year the form preselected when this was written.
    if year != '2012':
        driver.find_element_by_id('webTabtd1').click()
        _swap_option(
            driver, 'webTab__ctl1_ctrlTime_Migrated_lbMeldejahr',
            '2012', year)

    # county
    if county != '0':
        driver.find_element_by_id('webTabtd2').click()
        _swap_option(
            driver, 'webTab__ctl2_ctrlPlace_Migrated_lbLandkreis',
            '0', county)

    # the gender and age controls below all carry the 'ctl3' prefix, so this
    # tab is opened unconditionally
    driver.find_element_by_id('webTabtd3').click()

    # gender
    if gender != '0':
        _swap_option(
            driver, 'webTab__ctl3_ctrlPerson_Migrated_lbGeschlecht',
            '0', gender)

    # age group: the stratification is always chosen, the group only if given
    driver.find_element_by_css_selector(
        '#webTab__ctl3_ctrlPerson_Migrated_ddlAltersschichtung > option[value="%s"]'
        % stratification).click()
    if age_group != '-1':
        _swap_option(
            driver, 'webTab__ctl3_ctrlPerson_Migrated_lbAltersgruppe',
            '-1', age_group)

    # crosstab
    driver.find_element_by_id('webTabtd5').click()
    driver.find_element_by_css_selector(
        '#webTab__ctl5_ctrlDisplay_Migrated_ddlCrosstab > option[value="query.%s"]'
        % crosstab).click()

    # get results
    driver.find_element_by_id('btnSend').click()
    driver.get('http://www3.rki.de/survstat/ResultList.aspx')
    table_text = driver.find_element_by_css_selector('#dataGridResult').text

    # parse data and return
    return _parse_result_table(table_text)
def int_if_possible(v):
    '''Convert value to integer if possible, otherwise return it unchanged.

    Values int() cannot interpret are handed back as they came in. That
    covers text that is not a number (ValueError) as well as objects int()
    does not accept at all, such as None (TypeError).

    >>> int_if_possible('27')
    27
    >>> int_if_possible('Hemingway')
    'Hemingway'
    >>> int_if_possible(None) is None
    True
    '''
    try:
        return int(v)
    except (TypeError, ValueError):
        return v
def fsme_by_week_year_gender_county_coarse_age_group(
        driver, datafile, counties=None, years=None,
        age_groups=(0, 15, 20, 25, 30, 40, 50, 60, 70), genders=(1, 2)):
    '''Scrape FSME counts for every combination of county, year, age group
    and gender and write them to *datafile* as CSV.

    One query is issued through data() per combination; each entry of the
    'FSME' row of its result becomes one CSV row
    (county_id, county_name, year, week, gender, age_group, value).

    Arguments:
    driver -- Selenium WebDriver handed on to data()
    datafile -- writable file object; it is flushed after every query so an
                interrupted run keeps what was already fetched
    counties -- iterable of (county_id, county_name) pairs; defaults to the
                module-level COUNTIES list
    years -- iterable of years to query; defaults to 2001 through 2009
    age_groups -- age-group option values to query
                  (presumably the lower bounds of the coarse age bands --
                  TODO confirm against the form)
    genders -- gender codes to query (data() documents 1: male, 2: female)

    Raises KeyError if a query result has no 'FSME' row.
    '''
    if counties is None:
        counties = COUNTIES
    if years is None:
        years = range(2001, 2010)

    datawriter = writer(datafile)
    datawriter.writerow(('county_id', 'county_name', 'year', 'week', 'gender',
                         'age_group', 'value'))
    for county_id, county_name in counties:
        for year in years:
            for age_group in age_groups:
                for gender in genders:
                    r = data(driver, disease='FSV', year=year,
                             county=county_id, gender=gender,
                             age_group=age_group)
                    # items() instead of the Python-2-only iteritems()
                    for week, value in r['FSME'].items():
                        datawriter.writerow((county_id, county_name, year,
                                             week, gender, age_group, value))
                    datafile.flush()
if __name__ == '__main__':
    # Start the browser before entering the try block: if start-up fails there
    # is no driver to quit, and a NameError raised in the finally clause would
    # hide the real error.
    driver = webdriver.Firefox()
    try:
        driver.implicitly_wait(5)  # poll up to 5s when locating page elements
        # 'wb' is the mode the Python 2 csv module expects for its output
        # file; the with-block closes the file even if the scrape fails.
        with open('data.csv', 'wb') as datafile:
            fsme_by_week_year_gender_county_coarse_age_group(driver, datafile)
    finally:
        driver.quit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment