|
"""Quick and dirty script to scrape COVID-19 deaths, recoveries, and cases""" |
|
|
|
import re |
|
import requests |
|
import sys |
|
from bs4 import BeautifulSoup |
|
from io import BytesIO |
|
from PIL import Image, ImageOps |
|
from pytesseract import image_to_string |
|
from sys import argv |
|
from urllib.parse import urljoin |
|
|
|
# Cases & deaths (OCR'd from the page's infographic image — no structured data is published)
|
CASES_DEATHS_SOURCE_URL = 'https://www.santepubliquefrance.fr/dossiers/coronavirus-covid-19/coronavirus-chiffres-cles-et-evolution-de-la-covid-19-en-france-et-dans-le-monde' |
|
CASES_REGION = (0, 34, 226, 87) |
|
DEATHS_REGION = (0, 273, 226, 322) |
|
|
|
# Recoveries |
|
RECOVERIES_SOURCE_URL = 'https://geodes.santepubliquefrance.fr/GC_indic.php?lang=fr&indic=rad&dataset=covid_hospit&view=map2&filters=jour=%s' |
|
|
|
def fetch_cases_deaths():
    """Download and parse the Santé publique France key-figures page.

    Returns:
        BeautifulSoup: parsed HTML of the cases/deaths page.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not answer within 30 s.
    """
    # Timeout keeps the script from hanging forever on a stalled server;
    # raise_for_status stops us from parsing an error page as if it were data.
    response = requests.get(CASES_DEATHS_SOURCE_URL, timeout=30)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')
|
|
|
def find_date(soup, verbose=False):
    """Extract the publication date from the page's infographic title.

    Args:
        soup: parsed page (BeautifulSoup-like; must expose ``find_all``).
        verbose: if True, print the matched title for debugging.

    Returns:
        str: date as ``YEAR-MONTH-DAY`` — components are kept exactly as
        printed on the page (no zero-padding is added).

    Exits with status 1 (diagnostics on stderr) when the page does not
    contain exactly one title, or the title contains no ``dd/mm/yyyy`` date.
    """
    titles = soup.find_all(class_='content__img-title')
    if len(titles) != 1:
        print('Found %d titles, expected 1' % (len(titles),), file=sys.stderr)
        for title in titles:
            print(title.text)
        sys.exit(1)
    title = titles[0].text
    if verbose:
        print(title)
    # Raw string: '\s' / '\d' in a plain string literal are invalid escape
    # sequences (SyntaxWarning on Python 3.12+, error in future versions).
    date_regex = re.compile(r'\s(\d+)/(\d+)/(\d+)')
    match = date_regex.search(title)
    if match is None:
        # Previously this crashed with AttributeError on a dateless title;
        # fail with a clear message instead.
        print('No date found in title:', title, file=sys.stderr)
        sys.exit(1)
    (day, month, year) = match.groups()
    return '%s-%s-%s' % (year, month, day)
|
|
|
def find_image_url(soup):
    """Locate the single 'coronavirus' infographic image on the page.

    Args:
        soup: parsed page (BeautifulSoup-like).

    Returns:
        str: absolute URL of the infographic image.

    Exits with status 1 (listing every candidate src on stdout) when zero
    or several matching images are found.
    """
    images = [image.get('src') for image in soup.find_all('img')]
    # Guard against <img> tags without a src attribute: .get('src') yields
    # None for them, and `'coronavirus' in None` would raise TypeError.
    infographics = [image for image in images if image and 'coronavirus' in image]
    if len(infographics) != 1:
        print('Found %d infographics, expected 1' % (len(infographics),), file=sys.stderr)
        for image in images:
            print(image)
        sys.exit(1)
    return urljoin(CASES_DEATHS_SOURCE_URL, infographics[0])
|
|
|
def fetch_image(url):
    """Download *url* and decode the response body as a PIL image.

    Args:
        url: absolute URL of the image to fetch.

    Returns:
        PIL.Image.Image: the decoded image.

    Raises:
        requests.HTTPError: on a bad status, so PIL never tries to decode
        an HTML error page as an image.
    """
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return Image.open(BytesIO(response.content))
|
|
|
def to_bw(image):
    """Binarize *image* for OCR.

    Bright pixels (the light background) map to black (0) and everything
    else maps to white (255), producing a high-contrast inverted image.
    """
    THRESHOLD = 220

    def binarize(value):
        # Inversion while thresholding: background -> 0, digits -> 255.
        return 0 if value > THRESHOLD else 255

    return ImageOps.grayscale(image).point(binarize)
|
|
|
def ocr(image, verbose=False):
    """Run Tesseract on a cropped counter and return the number it shows.

    Args:
        image: PIL image of one counter region.
        verbose: if True, display the image and print the raw OCR output.

    Returns:
        int: the recognized number.

    Exits with status 1 when the recognized text is not purely numeric.
    """
    if verbose:
        image.show()
    # --psm 7: tell Tesseract the crop is a single line of text.
    raw = image_to_string(to_bw(image), config='--psm 7')
    if verbose:
        print(raw)
    digits = re.sub(r'\s+', '', raw)
    if not digits.isdigit():
        print('Not numeric:', digits, file=sys.stderr)
        sys.exit(1)
    return int(digits)
|
|
|
def get_ocr_stats(image, verbose=False):
    """OCR the case and death counters out of the full infographic.

    Args:
        image: PIL image of the whole infographic.
        verbose: if True, display each intermediate image and OCR output.

    Returns:
        dict: integer values under the keys 'cases' and 'deaths'.
    """
    if verbose:
        image.show()

    # Crop each fixed pixel region out of the infographic and OCR it.
    regions = {'cases': CASES_REGION, 'deaths': DEATHS_REGION}
    return {name: ocr(image.crop(box), verbose) for name, box in regions.items()}
|
|
|
def fetch_recoveries(date):
    """Fetch the cumulative hospital-discharge count for *date* from Geodes.

    Args:
        date: date string substituted into the Geodes query URL
        (the format produced by find_date).

    Returns:
        The 'sumth' aggregate from the Geodes JSON payload.

    Raises:
        requests.HTTPError: on a bad status, instead of failing later with
        an opaque JSON decode error.
    """
    response = requests.get(RECOVERIES_SOURCE_URL % (date,), timeout=30)
    response.raise_for_status()
    full = response.json()
    return full['content']['distribution']['sumth']
|
|
|
def main(verbose=False):
    """Scrape date, deaths, recoveries and cases; print one semicolon-separated line.

    Output format: date;deaths;recoveries;cases
    """
    soup = fetch_cases_deaths()
    date = find_date(soup, verbose)
    recoveries = fetch_recoveries(date)
    stats = get_ocr_stats(fetch_image(find_image_url(soup)), verbose)
    print('%s;%d;%d;%d' % (date, stats['deaths'], recoveries, stats['cases']))
|
|
|
if __name__ == '__main__':
    # Any command-line argument containing 'v' or 'V' enables verbose output.
    is_verbose = any('v' in arg.lower() for arg in argv[1:])
    main(is_verbose)