|
"""Quick and dirty script to scrape COVID-19 deaths, recoveries, and cases""" |
|
|
|
import re |
|
import requests |
|
import sys |
|
from bs4 import BeautifulSoup |
|
from io import BytesIO |
|
from PIL import Image, ImageOps |
|
from pytesseract import image_to_string |
|
from sys import argv |
|
from urllib.parse import urljoin |
|
|
|
# Cases & deaths (OCR'd from the page's infographic image — no structured data is published)
|
CASES_DEATHS_SOURCE_URL = 'https://www.santepubliquefrance.fr/dossiers/coronavirus-covid-19/coronavirus-chiffres-cles-et-evolution-de-la-covid-19-en-france-et-dans-le-monde' |
|
CASES_REGION = (0, 34, 226, 87) |
|
DEATHS_REGION = (0, 273, 226, 322) |
|
|
|
# Recoveries |
|
RECOVERIES_SOURCE_URL = 'https://geodes.santepubliquefrance.fr/GC_indic.php?lang=fr&indic=rad&dataset=covid_hospit&view=map2&filters=jour=%s' |
|
|
|
def fetch_cases_deaths():
    """Download and parse the Santé publique France key-figures page.

    Returns:
        BeautifulSoup: parsed HTML of the cases/deaths page.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not answer within 30 s.
    """
    # Timeout keeps the script from hanging forever on a stalled server;
    # raise_for_status stops us from parsing an error page as if it were data.
    response = requests.get(CASES_DEATHS_SOURCE_URL, timeout=30)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')
|
|
|
def find_date(soup, verbose=False):
    """Extract the publication date from the page's infographic title.

    Args:
        soup: parsed page (BeautifulSoup-like; must expose ``find_all``).
        verbose: if True, print the matched title for debugging.

    Returns:
        str: date as ``YEAR-MONTH-DAY`` — components are kept exactly as
        printed on the page (no zero-padding is added).

    Exits with status 1 (diagnostics on stderr) when the page does not
    contain exactly one title, or the title contains no ``dd/mm/yyyy`` date.
    """
    titles = soup.find_all(class_='content__img-title')
    if len(titles) != 1:
        print('Found %d titles, expected 1' % (len(titles),), file=sys.stderr)
        for title in titles:
            print(title.text)
        sys.exit(1)
    title = titles[0].text
    if verbose:
        print(title)
    # Raw string: '\s' / '\d' in a plain string literal are invalid escape
    # sequences (SyntaxWarning on Python 3.12+, error in future versions).
    date_regex = re.compile(r'\s(\d+)/(\d+)/(\d+)')
    match = date_regex.search(title)
    if match is None:
        # Previously this crashed with AttributeError on a dateless title;
        # fail with a clear message instead.
        print('No date found in title:', title, file=sys.stderr)
        sys.exit(1)
    (day, month, year) = match.groups()
    return '%s-%s-%s' % (year, month, day)
|
|
|
def find_image_url(soup):
    """Locate the single 'coronavirus' infographic image on the page.

    Args:
        soup: parsed page (BeautifulSoup-like).

    Returns:
        str: absolute URL of the infographic image.

    Exits with status 1 (listing every candidate src on stdout) when zero
    or several matching images are found.
    """
    images = [image.get('src') for image in soup.find_all('img')]
    # Guard against <img> tags without a src attribute: .get('src') yields
    # None for them, and `'coronavirus' in None` would raise TypeError.
    infographics = [image for image in images if image and 'coronavirus' in image]
    if len(infographics) != 1:
        print('Found %d infographics, expected 1' % (len(infographics),), file=sys.stderr)
        for image in images:
            print(image)
        sys.exit(1)
    return urljoin(CASES_DEATHS_SOURCE_URL, infographics[0])
|
|
|
def fetch_image(url):
    """Download *url* and decode the response body as a PIL image.

    Args:
        url: absolute URL of the image to fetch.

    Returns:
        PIL.Image.Image: the decoded image.

    Raises:
        requests.HTTPError: on a bad status, so PIL never tries to decode
        an HTML error page as an image.
    """
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return Image.open(BytesIO(response.content))
|
|
|
def to_bw(image):
    """Binarize *image* for OCR.

    Bright pixels (the light background) map to black (0) and everything
    else maps to white (255), producing a high-contrast inverted image.
    """
    THRESHOLD = 220

    def binarize(value):
        # Inversion while thresholding: background -> 0, digits -> 255.
        return 0 if value > THRESHOLD else 255

    return ImageOps.grayscale(image).point(binarize)
|
|
|
def ocr(image, verbose=False):
    """Run Tesseract on a cropped counter and return the number it shows.

    Args:
        image: PIL image of one counter region.
        verbose: if True, display the image and print the raw OCR output.

    Returns:
        int: the recognized number.

    Exits with status 1 when the recognized text is not purely numeric.
    """
    if verbose:
        image.show()
    # --psm 7: tell Tesseract the crop is a single line of text.
    raw = image_to_string(to_bw(image), config='--psm 7')
    if verbose:
        print(raw)
    digits = re.sub(r'\s+', '', raw)
    if not digits.isdigit():
        print('Not numeric:', digits, file=sys.stderr)
        sys.exit(1)
    return int(digits)
|
|
|
def get_ocr_stats(image, verbose=False):
    """OCR the case and death counters out of the full infographic.

    Args:
        image: PIL image of the whole infographic.
        verbose: if True, display each intermediate image and OCR output.

    Returns:
        dict: integer values under the keys 'cases' and 'deaths'.
    """
    if verbose:
        image.show()

    # Crop each fixed pixel region out of the infographic and OCR it.
    regions = {'cases': CASES_REGION, 'deaths': DEATHS_REGION}
    return {name: ocr(image.crop(box), verbose) for name, box in regions.items()}
|
|
|
def fetch_recoveries(date):
    """Fetch the cumulative hospital-discharge count for *date* from Geodes.

    Args:
        date: date string substituted into the Geodes query URL
        (the format produced by find_date).

    Returns:
        The 'sumth' aggregate from the Geodes JSON payload.

    Raises:
        requests.HTTPError: on a bad status, instead of failing later with
        an opaque JSON decode error.
    """
    response = requests.get(RECOVERIES_SOURCE_URL % (date,), timeout=30)
    response.raise_for_status()
    full = response.json()
    return full['content']['distribution']['sumth']
|
|
|
def main(verbose=False):
    """Scrape date, deaths, recoveries and cases; print one semicolon-separated line.

    Output format: date;deaths;recoveries;cases
    """
    soup = fetch_cases_deaths()
    date = find_date(soup, verbose)
    recoveries = fetch_recoveries(date)
    stats = get_ocr_stats(fetch_image(find_image_url(soup)), verbose)
    print('%s;%d;%d;%d' % (date, stats['deaths'], recoveries, stats['cases']))
|
|
|
if __name__ == '__main__':
    # Any command-line argument containing 'v' or 'V' enables verbose output.
    is_verbose = any('v' in arg.lower() for arg in argv[1:])
    main(is_verbose)