Madrid Polen Crawler
"""
This is a script for parsing polen levels in Madrid, Spain.
It used to work in 2019 but you may need to make fixes to make it work in the following years.
"""
import re
import locale
import time
import logging

import requests
import pandas as pd
from bs4 import BeautifulSoup
import coloredlogs

coloredlogs.install()
logger = logging.getLogger("Polen Crawler")

# Spanish locale, so that the date parsing below understands the Spanish
# month abbreviations in the dates returned by the server.
locale.setlocale(locale.LC_TIME, "es_ES")
def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield tuple(l[i:i + n])
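# e.g. list(chunks([1, 2, 3, 4, 5], 2)) -> [(1, 2), (3, 4), (5,)]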
def format_name(name):
    # The server appears to return mis-encoded names (UTF-8 bytes read as
    # Latin-1); map the mojibake sequences back to the accented letters.
    return name.replace('Ã¡', 'á').replace('Ã­', 'í')
def process_chunk(chunk):
    """Turn a (type, amount, "level (threshold)") triple into row fields."""
    type_ = format_name(chunk[0].strip())
    amount = chunk[1].strip()
    # Drop thousands separators between digits ("1.234" / "1,234" -> "1234").
    amount = re.sub(r'(?<=\d)[,\.](?=\d)', '', amount)
    # The third field looks like "level (threshold)"; split it apart and
    # strip the trailing ")".
    level, threshold = chunk[2].strip().split(" (")
    threshold = threshold.strip()[:-1]
    return (type_, amount, level, threshold)
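# Illustrative example (values are made up, shape follows the parsing above):
#   process_chunk((' Olivo ', ' 1.234 ', ' Alto (200)'))
#   -> ('Olivo', '1234', 'Alto', '200')

# Measuring stations, keyed by name; each value is the X/Y pixel position
# of the station within the WMS map image queried below.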
cities = {
    'Madrid-Arganzuela': 'X=262&Y=299',
    'Ciudad Universitaria': 'X=249&Y=282',
    'Las Rozas': 'X=199&Y=252',
    'Collado Villalba': 'X=171&Y=203',
    'Alcobendas': 'X=282&Y=242',
    'Salamanca': 'X=264&Y=279',
    'Coslada': 'X=304&Y=283',
    'Alcala de Henares': 'X=357&Y=262',
    'Getafe': 'X=252&Y=332',
    'Aranjuez': 'X=290&Y=441',
}
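# WMS 1.1.1 GetFeatureInfo request; the per-station X/Y arguments above
# select a pixel inside the 450x493 map described by BBOX/WIDTH/HEIGHT.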
base = 'http://gestiona.madrid.org/geoserver/wms?SERVICE=WMS&VERSION=1.1.1&REQUEST=GetFeatureInfo&LAYERS=SPOL_V_CAPTADORES_GIS&QUERY_LAYERS=SPOL_V_CAPTADORES_GIS&STYLES=&BBOX=365560.97254%2C4415910.465472%2C495339.02746%2C4558089.534528&FEATURE_COUNT=50&HEIGHT=493&WIDTH=450&FORMAT=image%2Fpng&INFO_FORMAT=text%2Fhtml&SRS=EPSG%3A23030&'
columns = ['date', 'point', 'type', 'amount', 'level', 'threshold']
try:
    # Resume from an existing CSV so reruns append rather than overwrite.
    # (The dates were written as %Y-%m-%d, which parse_dates handles.)
    df = pd.read_csv("polen.csv", parse_dates=['date'])
except FileNotFoundError:
    df = pd.DataFrame(columns=columns)
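# Stations whose download or parsing yielded nothing.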
failed = set()
for city, arg in cities.items():
    logger.info(f"Fetching {city}...")
    html_doc = requests.get(base + arg).text
    logger.info(f"Parsing {city}...")
    soup = BeautifulSoup(html_doc, 'html.parser')
    # As of 2019 the HTML response carried its data in <label> elements:
    # station name, date, and then (type, amount, level) triples.
    results = [label.get_text() for label in soup.find_all('label')]
    if results:
        point = results[1].strip()
        date = results[3].strip()
        new_rows = pd.DataFrame(
            [[date, point, *process_chunk(chunk)] for chunk in chunks(results[7:], 3)],
            columns=columns)
        # %d-%b-%Y with Spanish month abbreviations, hence the locale
        # set at the top of the script.
        new_rows.date = pd.to_datetime(new_rows.date, format='%d-%b-%Y')
        if not new_rows.empty:
            df = pd.concat([df, new_rows], sort=True)
            num_rows = new_rows.shape[0]
            logger.info(f"Appended {num_rows} rows to the records")
        else:
            failed.add(city)
    else:
        logger.error(f"No data returned for {city}")
        failed.add(city)
    time.sleep(3)
logger.info("Processing concatenated data...")
df = df.drop_duplicates(subset=['date', 'point', 'type'])
df = df.sort_values(by=['date', 'point', 'type']).reset_index(drop=True)
df = df[columns]
logger.info("Writing to csv file...")
df.to_csv("polen.csv", index=False)
logger.info("Finished!")
if failed:
    failed_cities = ", ".join(failed)
    logger.error(f"Following points failed: {failed_cities}")
# The same data as JSON, fetched with httpie:
# http http://gestiona.madrid.org/geoserver/wms\?SERVICE\=WMS\&VERSION\=1.1.1\&REQUEST\=GetFeatureInfo\&LAYERS\=SPOL_V_CAPTADORES_GIS\&QUERY_LAYERS\=SPOL_V_CAPTADORES_GIS\&STYLES\=\&BBOX\=365560.97254%2C4415910.465472%2C495339.02746%2C4558089.534528\&FEATURE_COUNT\=50\&HEIGHT\=493\&WIDTH\=450\&FORMAT\=image%2Fpng\&INFO_FORMAT\=application%2Fjson\&SRS\=EPSG%3A23030\&X\=238\&Y\=325