Madrid Polen Crawler
"""
This is a script for parsing polen levels in Madrid, Spain.
It used to work in 2019 but you may need to make fixes to make it work in the following years.
"""
import re
import locale
import time
import logging

import requests
import pandas as pd
from bs4 import BeautifulSoup
import coloredlogs

coloredlogs.install()
logger = logging.getLogger("Polen Crawler")

# Spanish locale, so that the date parsing below understands the Spanish
# month abbreviations in the dates returned by the server.
locale.setlocale(locale.LC_TIME, "es_ES")
def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield tuple(l[i:i + n])
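# e.g. list(chunks([1, 2, 3, 4, 5], 2)) -> [(1, 2), (3, 4), (5,)]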
def format_name(name):
    # The server appears to return mis-encoded names (UTF-8 bytes read as
    # Latin-1); map the mojibake sequences back to the accented letters.
    return name.replace('Ã¡', 'á').replace('Ã­', 'í')
def process_chunk(chunk):
    """Turn a (type, amount, "level (threshold)") triple into row fields."""
    type_ = format_name(chunk[0].strip())
    amount = chunk[1].strip()
    # Drop thousands separators between digits ("1.234" / "1,234" -> "1234").
    amount = re.sub(r'(?<=\d)[,\.](?=\d)', '', amount)
    # The third field looks like "level (threshold)"; split it apart and
    # strip the trailing ")".
    level, threshold = chunk[2].strip().split(" (")
    threshold = threshold.strip()[:-1]
    return (type_, amount, level, threshold)
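# Illustrative example (values are made up, shape follows the parsing above):
#   process_chunk((' Olivo ', ' 1.234 ', ' Alto (200)'))
#   -> ('Olivo', '1234', 'Alto', '200')

# Measuring stations, keyed by name; each value is the X/Y pixel position
# of the station within the WMS map image queried below.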
cities = {
    'Madrid-Arganzuela': 'X=262&Y=299',
    'Ciudad Universitaria': 'X=249&Y=282',
    'Las Rozas': 'X=199&Y=252',
    'Collado Villalba': 'X=171&Y=203',
    'Alcobendas': 'X=282&Y=242',
    'Salamanca': 'X=264&Y=279',
    'Coslada': 'X=304&Y=283',
    'Alcala de Henares': 'X=357&Y=262',
    'Getafe': 'X=252&Y=332',
    'Aranjuez': 'X=290&Y=441',
}
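# WMS 1.1.1 GetFeatureInfo request; the per-station X/Y arguments above
# select a pixel inside the 450x493 map described by BBOX/WIDTH/HEIGHT.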
base = 'http://gestiona.madrid.org/geoserver/wms?SERVICE=WMS&VERSION=1.1.1&REQUEST=GetFeatureInfo&LAYERS=SPOL_V_CAPTADORES_GIS&QUERY_LAYERS=SPOL_V_CAPTADORES_GIS&STYLES=&BBOX=365560.97254%2C4415910.465472%2C495339.02746%2C4558089.534528&FEATURE_COUNT=50&HEIGHT=493&WIDTH=450&FORMAT=image%2Fpng&INFO_FORMAT=text%2Fhtml&SRS=EPSG%3A23030&'
columns = ['date', 'point', 'type', 'amount', 'level', 'threshold']
try:
    # Resume from an existing CSV so reruns append rather than overwrite.
    # (The dates were written as %Y-%m-%d, which parse_dates handles.)
    df = pd.read_csv("polen.csv", parse_dates=['date'])
except FileNotFoundError:
    df = pd.DataFrame(columns=columns)
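# Stations whose download or parsing yielded nothing.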
failed = set()
for city, arg in cities.items():
    logger.info(f"Fetching {city}...")
    html_doc = requests.get(base + arg).text
    logger.info(f"Parsing {city}...")
    soup = BeautifulSoup(html_doc, 'html.parser')
    # As of 2019 the HTML response carried its data in <label> elements:
    # station name, date, and then (type, amount, level) triples.
    results = [label.get_text() for label in soup.find_all('label')]
    if results:
        point = results[1].strip()
        date = results[3].strip()
        new_rows = pd.DataFrame(
            [[date, point, *process_chunk(chunk)] for chunk in chunks(results[7:], 3)],
            columns=columns)
        # %d-%b-%Y with Spanish month abbreviations, hence the locale
        # set at the top of the script.
        new_rows.date = pd.to_datetime(new_rows.date, format='%d-%b-%Y')
        if not new_rows.empty:
            df = pd.concat([df, new_rows], sort=True)
            num_rows = new_rows.shape[0]
            logger.info(f"Appended {num_rows} rows to the records")
        else:
            failed.add(city)
    else:
        logger.error(f"No data returned for {city}")
        failed.add(city)
    time.sleep(3)
logger.info("Processing concatenated data...")
df = df.drop_duplicates(subset=['date', 'point', 'type'])
df = df.sort_values(by=['date', 'point', 'type']).reset_index(drop=True)
df = df[columns]
logger.info("Writing to csv file...")
df.to_csv("polen.csv", index=False)
logger.info("Finished!")
if failed:
    failed_cities = ", ".join(failed)
    logger.error(f"Following points failed: {failed_cities}")
# The same data as JSON, fetched with httpie:
# http http://gestiona.madrid.org/geoserver/wms\?SERVICE\=WMS\&VERSION\=1.1.1\&REQUEST\=GetFeatureInfo\&LAYERS\=SPOL_V_CAPTADORES_GIS\&QUERY_LAYERS\=SPOL_V_CAPTADORES_GIS\&STYLES\=\&BBOX\=365560.97254%2C4415910.465472%2C495339.02746%2C4558089.534528\&FEATURE_COUNT\=50\&HEIGHT\=493\&WIDTH\=450\&FORMAT\=image%2Fpng\&INFO_FORMAT\=application%2Fjson\&SRS\=EPSG%3A23030\&X\=238\&Y\=325