@aladagemre
Created July 12, 2020 12:38
Madrid Polen Crawler
"""
This script scrapes pollen levels for the Madrid region, Spain.
It worked as of 2019; the source page may have changed since, so the parsing
below may need adjustments in later years.
"""
import re
import locale
import time
import logging
import requests
import pandas as pd
from bs4 import BeautifulSoup
import coloredlogs

coloredlogs.install()
logger = logging.getLogger("Polen Crawler")

# The page prints dates with Spanish month abbreviations ("ene", "jul", ...),
# so the %d-%b-%Y parsing below needs the es_ES locale.
locale.setlocale(locale.LC_TIME, "es_ES")

def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield tuple(l[i:i + n])
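# For example, chunks(['a', 'b', 'c', 'd'], 3) yields ('a', 'b', 'c')
# and then the shorter tail ('d',).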

def format_name(name):
    # Undo mojibake in type names (UTF-8 bytes decoded as Latin-1); the
    # exact garbled sequences are a best guess, e.g. 'Ã¡' -> 'á'.
    return name.replace('Ã¡', 'á').replace('Ã­', 'í')

def process_chunk(chunk):
    type_ = format_name(chunk[0].strip())
    amount = chunk[1].strip()
    # Strip thousands separators inside numbers, e.g. "1.234" -> "1234".
    amount = re.sub(r'(?<=\d)[,\.](?=\d)', '', amount)
    # A cell like "Bajo (50)" splits into level "Bajo" and threshold "50".
    level, threshold = chunk[2].strip().split(" (")
    threshold = threshold.strip()[:-1]
    return (type_, amount, level, threshold)
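# For illustration (the label text is an assumption about the 2019 page):
# process_chunk(('Gramíneas', '1.234', 'Alto (50)')) returns
# ('Gramíneas', '1234', 'Alto', '50').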

cities = {
    'Madrid-Arganzuela': 'X=262&Y=299',
    'Ciudad Universitaria': 'X=249&Y=282',
    'Las Rozas': 'X=199&Y=252',
    'Collado Villalba': 'X=171&Y=203',
    'Alcobendas': 'X=282&Y=242',
    'Salamanca': 'X=264&Y=279',
    'Coslada': 'X=304&Y=283',
    'Alcala de Henares': 'X=357&Y=262',
    'Getafe': 'X=252&Y=332',
    'Aranjuez': 'X=290&Y=441',
}
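# Each value is a pixel (X, Y) inside the 450x493 map image requested below;
# WMS GetFeatureInfo resolves that pixel back to the sampling station there.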
base = 'http://gestiona.madrid.org/geoserver/wms?SERVICE=WMS&VERSION=1.1.1&REQUEST=GetFeatureInfo&LAYERS=SPOL_V_CAPTADORES_GIS&QUERY_LAYERS=SPOL_V_CAPTADORES_GIS&STYLES=&BBOX=365560.97254%2C4415910.465472%2C495339.02746%2C4558089.534528&FEATURE_COUNT=50&HEIGHT=493&WIDTH=450&FORMAT=image%2Fpng&INFO_FORMAT=text%2Fhtml&SRS=EPSG%3A23030&'
columns = ['date', 'point', 'type', 'amount', 'level', 'threshold']
try:
    # Resume from a previous run's CSV so records accumulate across runs.
    # (pd.datetime.strptime was removed in pandas 2.0; parse_dates alone
    # handles the ISO dates that to_csv writes below.)
    df = pd.read_csv("polen.csv", parse_dates=['date'])
except FileNotFoundError:
    df = pd.DataFrame(columns=columns)
failed = set()
for city, arg in cities.items():
    logger.info(f"Fetching {city}...")
    html_doc = requests.get(base + arg).text
    logger.info(f"Parsing {city}...")
    soup = BeautifulSoup(html_doc, 'html.parser')
    results = [label.get_text() for label in soup.find_all('label')]
    if results:
        # Fixed positions in the <label> list: [1] station name, [3] date,
        # then (type, amount, level/threshold) triples from index 7 onwards.
        point = results[1].strip()
        date = results[3].strip()
        new_rows = pd.DataFrame(
            [[date, point, *process_chunk(chunk)] for chunk in chunks(results[7:], 3)],
            columns=columns,
        )
        new_rows.date = pd.to_datetime(new_rows.date, format='%d-%b-%Y')
        if not new_rows.empty:
            df = pd.concat([df, new_rows], sort=True)
            num_rows = new_rows.shape[0]
            logger.info(f"Appended {num_rows} rows to the records")
        else:
            failed.add(city)
    else:
        logger.error(f"Could not download {city}")
        failed.add(city)
    time.sleep(3)  # be polite to the server
logger.info("Processing concatenated data...")
df = df.drop_duplicates(subset=['date', 'point', 'type'])
df = df.sort_values(by=['date', 'point', 'type']).reset_index(drop=True)
df = df[columns]
logger.info("Writing to csv file...")
df.to_csv("polen.csv", index=False)
logger.info("Finished!")
if failed:
    failed_cities = ", ".join(sorted(failed))
    logger.error(f"The following points failed: {failed_cities}")
# The same endpoint serves JSON with INFO_FORMAT=application/json, e.g. via httpie:
# http http://gestiona.madrid.org/geoserver/wms\?SERVICE\=WMS\&VERSION\=1.1.1\&REQUEST\=GetFeatureInfo\&LAYERS\=SPOL_V_CAPTADORES_GIS\&QUERY_LAYERS\=SPOL_V_CAPTADORES_GIS\&STYLES\=\&BBOX\=365560.97254%2C4415910.465472%2C495339.02746%2C4558089.534528\&FEATURE_COUNT\=50\&HEIGHT\=493\&WIDTH\=450\&FORMAT\=image%2Fpng\&INFO_FORMAT\=application%2Fjson\&SRS\=EPSG%3A23030\&X\=238\&Y\=325
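
# A minimal sketch (untested; the GeoJSON layout is an assumption based on
# standard WMS GetFeatureInfo output) of reading the JSON variant instead of
# scraping HTML:
def fetch_point_json(arg):
    """Fetch GetFeatureInfo as GeoJSON for one 'X=...&Y=...' argument."""
    json_base = base.replace('INFO_FORMAT=text%2Fhtml',
                             'INFO_FORMAT=application%2Fjson')
    return requests.get(json_base + arg).json()

# Example (not called above): fetch_point_json(cities['Getafe']) should return
# a FeatureCollection whose features carry the station's readings in
# 'properties'.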