Skip to content

Instantly share code, notes, and snippets.

@LeopoldTal
Last active August 21, 2020 07:02
Show Gist options
  • Save LeopoldTal/ff64601de455749a9d457b098a3da01f to your computer and use it in GitHub Desktop.
Save LeopoldTal/ff64601de455749a9d457b098a3da01f to your computer and use it in GitHub Desktop.
Scraper for official government statistics for COVID-19 in France

Scraper for official government statistics for COVID-19 in France

Purpose

For no reason I can fathom, the French government publishes daily COVID-19 cases and deaths as an infographic, with no text version. There are also no archives: only the latest update is available.

Recovery statistics are published on a different source, which has a (somewhat wonky) API and data for each day.

This is a quick and (very) dirty script to scrape the latest statistics off these two sites.

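Dependencies

  • requests
  • beautifulsoup4 (imported as bs4)
  • Pillow (imported as PIL)
  • pytesseract (which in turn needs the Tesseract OCR engine installed)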

Usage

python govscrape.py [-v]

outputs:

  • the date of the latest update
  • total deaths
  • total "recoveries" (actually patients discharged from hospital)
  • total cases

separated by semicolons, to match the format used on Wikipedia.

"""Quick and dirty script to scrape COVID-19 deaths, recoveries, and cases"""
import re
import requests
import sys
from bs4 import BeautifulSoup
from io import BytesIO
from PIL import Image, ImageOps
from pytesseract import image_to_string
from sys import argv
from urllib.parse import urljoin
# Cases & deaths (stupid infographic)
# Page fetched by fetch_cases_deaths(); find_date() and find_image_url() both
# read from its parsed HTML.
CASES_DEATHS_SOURCE_URL = 'https://www.santepubliquefrance.fr/dossiers/coronavirus-covid-19/coronavirus-chiffres-cles-et-evolution-de-la-covid-19-en-france-et-dans-le-monde'
# Boxes passed to image.crop() in get_ocr_stats() to isolate each number before
# OCR. Pillow's crop() takes (left, upper, right, lower) pixel coordinates.
# NOTE(review): the values are tied to the current infographic layout and
# presumably need updating whenever the image design changes.
CASES_REGION = (0, 34, 226, 87)
DEATHS_REGION = (0, 273, 226, 322)
# Recoveries
# The trailing %s is filled in by fetch_recoveries() with the date string
# produced by find_date().
RECOVERIES_SOURCE_URL = 'https://geodes.santepubliquefrance.fr/GC_indic.php?lang=fr&indic=rad&dataset=covid_hospit&view=map2&filters=jour=%s'
def fetch_cases_deaths():
    """Download the cases/deaths page and return it as a BeautifulSoup tree.

    Returns:
        The page at CASES_DEATHS_SOURCE_URL parsed with 'html.parser'.

    Raises:
        requests.RequestException: on a network failure, a timeout, or an
            HTTP error status.
    """
    # Without an explicit timeout requests waits forever on a stalled server.
    response = requests.get(CASES_DEATHS_SOURCE_URL, timeout=30)
    # Stop on 4xx/5xx instead of parsing an error page as if it were the data.
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')
def find_date(soup, verbose=False):
    """Extract the date of the latest update from the infographic title.

    Args:
        soup: parsed page; searched for elements whose class is
            'content__img-title'.
        verbose: when true, print the raw title text to stdout.

    Returns:
        The date as 'year-month-day', with month and day zero-padded to two
        digits so the string is usable as an ISO-style date. The year is kept
        exactly as it appears in the title.

    Exits the process with status 1 (after a message on stderr) when the page
    does not contain exactly one title, or when the title holds no
    day/month/year date.
    """
    titles = soup.find_all(class_='content__img-title')
    if len(titles) != 1:
        print('Found %d titles, expected 1' % (len(titles),), file=sys.stderr)
        # Diagnostics go to stderr so stdout only ever carries the result line.
        for title in titles:
            print(title.text, file=sys.stderr)
        sys.exit(1)
    title = titles[0].text
    if verbose:
        print(title)
    # Raw string: '\s' and '\d' are invalid escapes in a normal string literal.
    match = re.search(r'\s(\d+)/(\d+)/(\d+)', title)
    if match is None:
        # Report the problem in the same way as the other scraping failures
        # instead of crashing on None.groups().
        print('No date found in title: %s' % (title,), file=sys.stderr)
        sys.exit(1)
    day, month, year = match.groups()
    return '%s-%s-%s' % (year, month.zfill(2), day.zfill(2))
def find_image_url(soup):
    """Return the absolute URL of the single infographic image on the page.

    Args:
        soup: parsed page; every <img> element is inspected.

    Returns:
        The 'src' of the one image whose URL contains 'coronavirus', resolved
        against CASES_DEATHS_SOURCE_URL.

    Exits the process with status 1 (after listing the candidate image URLs on
    stderr) when the number of matching images is not exactly one.
    """
    sources = [image.get('src') for image in soup.find_all('img')]
    # An <img> without a src attribute yields None, and "'coronavirus' in None"
    # raises TypeError, so drop missing/empty sources before matching.
    images = [source for source in sources if source]
    infographics = [image for image in images if 'coronavirus' in image]
    if len(infographics) != 1:
        print('Found %d infographics, expected 1' % (len(infographics),), file=sys.stderr)
        # Diagnostics go to stderr so stdout only ever carries the result line.
        for image in images:
            print(image, file=sys.stderr)
        sys.exit(1)
    return urljoin(CASES_DEATHS_SOURCE_URL, infographics[0])
def fetch_image(url):
    """Download the image at `url` and open it with Pillow.

    Args:
        url: absolute URL of the image.

    Returns:
        The PIL Image opened from the downloaded bytes.

    Raises:
        requests.RequestException: on a network failure, a timeout, or an
            HTTP error status.
    """
    # Without an explicit timeout requests waits forever on a stalled server.
    response = requests.get(url, timeout=30)
    # Stop on 4xx/5xx here; otherwise the error body would be handed to
    # Image.open and fail with an unrelated "cannot identify image" error.
    response.raise_for_status()
    return Image.open(BytesIO(response.content))
def to_bw(image):
    """Convert `image` to grayscale, then to an inverted two-level image.

    Pixels brighter than the cutoff (220) become 0; all other pixels
    become 255.
    """
    cutoff = 220

    def binarise(pixel):
        # Bright pixels map to 0, everything else to 255.
        if pixel > cutoff:
            return 0
        return 255

    grey = ImageOps.grayscale(image)
    return grey.point(binarise)
def ocr(image, verbose=False):
    """Read the single integer shown in `image` using Tesseract.

    Args:
        image: PIL image containing one number.
        verbose: when true, display the image and print the raw OCR text.

    Returns:
        The recognised number as an int; whitespace inside the OCR output
        (e.g. thousands spacing) is discarded first.

    Exits the process with status 1 (after a message on stderr) when the
    recognised text is not made up solely of decimal digits.
    """
    if verbose:
        image.show()
    image = to_bw(image)
    # '--psm 7' is Tesseract's page segmentation mode for a single text line.
    text = image_to_string(image, config='--psm 7')
    if verbose:
        print(text)
    text = re.sub(r'\s+', '', text)
    # isdecimal() rather than isdigit(): isdigit() also accepts characters
    # such as superscript digits, which int() then rejects with ValueError.
    if not text.isdecimal():
        print('Not numeric:', text, file=sys.stderr)
        sys.exit(1)
    return int(text)
def get_ocr_stats(image, verbose = False):
    """OCR the case and death counts out of the infographic.

    Crops `image` to CASES_REGION and then DEATHS_REGION, runs ocr() on each
    crop, and returns the results as {'cases': ..., 'deaths': ...}. When
    `verbose` is true the full image is displayed first and the flag is
    forwarded to ocr().
    """
    if verbose:
        image.show()
    stats = {}
    # Cases first, then deaths.
    for label, box in (('cases', CASES_REGION), ('deaths', DEATHS_REGION)):
        stats[label] = ocr(image.crop(box), verbose)
    return stats
def fetch_recoveries(date):
    """Fetch the recoveries total for `date` from the recoveries endpoint.

    Args:
        date: date string substituted into RECOVERIES_SOURCE_URL.

    Returns:
        The value stored under content -> distribution -> sumth in the JSON
        response.

    Raises:
        requests.RequestException: on a network failure, a timeout, or an
            HTTP error status.
    """
    # Without an explicit timeout requests waits forever on a stalled server.
    response = requests.get(RECOVERIES_SOURCE_URL % (date,), timeout=30)
    # Stop on 4xx/5xx instead of trying to decode an error page as JSON.
    response.raise_for_status()
    full = response.json()
    return full['content']['distribution']['sumth']
def main(verbose = False):
    """Scrape the latest figures and print them as 'date;deaths;recoveries;cases'.

    The date is read from the cases/deaths page, recoveries are fetched for
    that date, and cases and deaths are OCR'd from the infographic found on
    the same page. `verbose` is forwarded to the date and OCR steps.
    """
    page = fetch_cases_deaths()
    latest_date = find_date(page, verbose)
    discharged = fetch_recoveries(latest_date)
    infographic_url = find_image_url(page)
    infographic = fetch_image(infographic_url)
    counts = get_ocr_stats(infographic, verbose)
    fields = (latest_date, counts['deaths'], discharged, counts['cases'])
    print('%s;%d;%d;%d' % fields)
if __name__ == '__main__':
    # Verbose mode is switched on by any command-line argument that contains
    # the letter 'v' (case-insensitive), e.g. -v.
    verbose = False
    for arg in argv[1:]:
        if 'v' in arg.lower():
            verbose = True
            break
    main(verbose)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment