Last active
August 29, 2015 13:58
-
-
Save johnjohndoe/10391569 to your computer and use it in GitHub Desktop.
Pegelstände-Scraper für Sachsen (water-level scraper for Saxony, Germany), http://scraperwiki.com
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import scraperwiki | |
import requests | |
import lxml.html | |
from collections import namedtuple | |
from datetime import datetime | |
# One scraped measurement: station name, reading timestamp, water level and flow rate.
WaterLevelData = namedtuple("WaterLevelData", "station_name date_time water_level flow_rate")
def retrieve_page_urls(url):
    """Collect the station page URLs linked from the overview page at *url*.

    Only anchors that carry an ``onmouseover`` attribute are kept (the
    plain navigation anchors on the page do not define one).  Relative
    hrefs are resolved against the directory of *url* rather than a
    hard-coded base, so any overview page with the same layout works.
    """
    html = requests.get(url).content
    dom = lxml.html.fromstring(html)
    # Directory of the overview page, e.g. ".../hwz/inhalt_re.html" -> ".../hwz/"
    base = url.rsplit('/', 1)[0] + '/'
    urls = []
    for anker in dom.cssselect('body div a'):
        # Exclude odd ankers: only station links define an onmouseover handler.
        if anker.get('onmouseover') is not None:
            urls.append(base + anker.get('href'))
    return urls
def retrieve_water_level_datas(url):
    """Scrape one station page and return a list of WaterLevelData records.

    Parses the last table with class "rahmen" on the page (the one holding
    the measurements), skipping its header row.  The flow-rate column uses
    a decimal comma; a non-numeric value (e.g. "k.A.") is stored as 0.
    """
    html = requests.get(url).content
    dom = lxml.html.fromstring(html)
    station_name = retrieve_station_name(dom)
    rahmen_tables = dom.cssselect('html body table table.rahmen')
    # Use the last "rahmen" table on the page
    water_level_table = rahmen_tables[-1]
    rows = water_level_table.cssselect('tr')
    water_level_datas = []
    # Skip table header
    for row in rows[1:]:
        cells = row.cssselect('td')
        date_time = datetime.strptime(cells[0].text_content(), '%d.%m.%Y %H:%M')
        water_level = int(cells[1].text_content())
        flow_rate_text = cells[2].text_content()
        try:
            flow_rate = float(flow_rate_text.replace(',', '.'))
        except ValueError:
            # Sometimes the value is "k.A." (not available) -> record as 0.
            # Catch only ValueError; a bare except would also hide real bugs.
            flow_rate = 0
        water_level_datas.append(
            WaterLevelData(station_name, date_time, water_level, flow_rate))
    return water_level_datas
def retrieve_station_name(dom):
    """Return the station name from the page-title span of *dom*.

    Non-breaking spaces are normalized to plain spaces and the leading
    "Pegel: " label is removed from the raw title text.
    """
    raw_title = dom.cssselect('body table span.titel')[0].text_content()
    return raw_title.replace(u'\xa0', u' ').replace(u'Pegel: ', u'')
def store_water_level_data(item):
    """Persist one WaterLevelData record, upserting on (station_name, date_time).

    The CREATE TABLE is deliberately best-effort: it fails harmlessly once
    the table exists, and scraperwiki adds the remaining columns on save.
    """
    try:
        scraperwiki.sqlite.execute("""
            create table water_levels_saxony
            (
              id INTEGER PRIMARY KEY AUTOINCREMENT
            )
        """)
    except Exception:
        # Table probably already exists.  Keep the best-effort behaviour
        # but avoid a bare except, which would also swallow SystemExit
        # and KeyboardInterrupt.
        pass
    unique_keys = ['station_name', 'date_time']
    data = {
        'station_name': item.station_name,
        'date_time': item.date_time,
        'water_level': item.water_level,
        'flow_rate': item.flow_rate,
    }
    scraperwiki.sql.save(unique_keys, data, table_name='water_levels_saxony')
def store_water_level_datas(items):
    """Store every WaterLevelData record in *items*."""
    for water_level_data in items:
        store_water_level_data(water_level_data)
def main():
    """Scrape every station page linked from the Saxony overview and store the readings."""
    overview_url = "http://www.umwelt.sachsen.de/de/wu/umwelt/lfug/lfug-internet/hwz/inhalt_re.html"
    for url in retrieve_page_urls(overview_url):
        water_level_datas = retrieve_water_level_datas(url)
        store_water_level_datas(water_level_datas)

# Guard the entry point so importing this module does not start the scrape.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment