Skip to content

Instantly share code, notes, and snippets.

@johnjohndoe
Last active August 29, 2015 13:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save johnjohndoe/10391569 to your computer and use it in GitHub Desktop.
Save johnjohndoe/10391569 to your computer and use it in GitHub Desktop.
Pegelstände-Scraper für Sachsen, http://scraperwiki.com
#!/usr/bin/env python
import scraperwiki
import requests
import lxml.html
from collections import namedtuple
from datetime import datetime
# Immutable record for one gauge-station measurement:
# station name, timestamp, water level, and flow rate.
WaterLevelData = namedtuple(
    "WaterLevelData",
    ["station_name", "date_time", "water_level", "flow_rate"],
)
def retrieve_page_urls(url):
    """Return absolute URLs of the station pages linked from *url*.

    Station links are recognized by carrying an ``onmouseover``
    attribute, which distinguishes them from the other anchors
    ("odd ankers") on the index page.
    """
    base = 'http://www.umwelt.sachsen.de/de/wu/umwelt/lfug/lfug-internet/hwz/'
    dom = lxml.html.fromstring(requests.get(url).content)
    return [
        base + anchor.get('href')
        for anchor in dom.cssselect('body div a')
        if anchor.get('onmouseover') is not None
    ]
def retrieve_water_level_datas(url):
    """Scrape one station page and return a list of WaterLevelData records.

    The measurement rows live in the last table with CSS class "rahmen"
    on the page; each row holds a timestamp, a water level, and a flow
    rate.  A flow-rate cell that cannot be parsed as a number (the site
    sometimes shows "k.A.") is recorded as 0.
    """
    html = requests.get(url).content
    dom = lxml.html.fromstring(html)
    station_name = retrieve_station_name(dom)
    rahmen_tables = dom.cssselect('html body table table.rahmen')
    # Use the last "rahmen" table on the page.
    water_level_table = rahmen_tables[-1]
    rows = water_level_table.cssselect('tr')
    water_level_datas = []
    # rows[0] is the table header -- skip it.
    for row in rows[1:]:
        cells = row.cssselect('td')
        date_time = datetime.strptime(cells[0].text_content(), '%d.%m.%Y %H:%M')
        water_level = int(cells[1].text_content())
        flow_rate_text = cells[2].text_content()
        try:
            # Values use a decimal comma, e.g. "12,3".
            flow_rate = float(flow_rate_text.replace(',', '.'))
        except ValueError:
            # Sometimes the value is "k.A." (no data available).
            flow_rate = 0
        water_level_datas.append(
            WaterLevelData(station_name, date_time, water_level, flow_rate))
    return water_level_datas
def retrieve_station_name(dom):
    """Extract the gauge-station name from a parsed station page.

    The title span reads "Pegel: <name>" and may contain non-breaking
    spaces; the prefix is removed and NBSPs become ordinary spaces.
    """
    title = dom.cssselect('body table span.titel')[0].text_content()
    title = title.replace(u'\xa0', u' ')
    return title.replace(u'Pegel: ', u'')
def store_water_level_data(item):
    """Persist a single WaterLevelData record to the scraper database.

    Rows are upserted keyed on (station_name, date_time), so re-scraping
    the same measurement does not create duplicates.  Table creation is
    attempted on every call and is expected to fail once the table
    exists, which is treated as a no-op.
    """
    try:
        scraperwiki.sqlite.execute("""
create table water_levels_saxony
(
id INTEGER PRIMARY KEY AUTOINCREMENT
)
""")
    except Exception:
        # Table probably already exists.
        pass
    unique_keys = ['station_name', 'date_time']
    data = {
        'station_name': item.station_name,
        'date_time': item.date_time,
        'water_level': item.water_level,
        'flow_rate': item.flow_rate,
    }
    scraperwiki.sql.save(unique_keys, data, table_name='water_levels_saxony')
def store_water_level_datas(items):
    """Store every WaterLevelData record in *items*, one at a time."""
    for record in items:
        store_water_level_data(record)
# Entry point: collect every station page linked from the index,
# scrape its measurements, and persist them.
urls = retrieve_page_urls("http://www.umwelt.sachsen.de/de/wu/umwelt/lfug/lfug-internet/hwz/inhalt_re.html")
for url in urls:
    store_water_level_datas(retrieve_water_level_datas(url))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment