campagnola/flu_vs_covid19.py

## flu_vs_covid19.py
"""
A week-by-week comparison of deaths caused by influenza and COVID-19 in the United States.

CDC influenza / pneumonia data:
https://www.cdc.gov/flu/weekly/#S2

COVID19 data:
http://www.healthdata.org/covid/data-downloads
"""

import io
import zipfile
import urllib.request
from datetime import datetime
import numpy as np
import pyqtgraph as pg
import pandas

# Download flu data from CDC and parse the csv file. The response is used as
# a context manager so the HTTP connection is closed once parsing is done.
flu_url = 'https://www.cdc.gov/flu/weekly/weeklyarchives2019-2020/data/NCHSData15.csv'
with urllib.request.urlopen(flu_url) as flu_fh:
    flu = pandas.read_csv(flu_fh)

# Download COVID-19 data from IHME. The zip archive is read fully into memory
# so the connection can be closed before the archive is inspected.
cov_url = 'https://ihmecovid19storage.blob.core.windows.net/latest/ihme-covid19.zip'
with urllib.request.urlopen(cov_url) as cov_fh:
    cov_zf = zipfile.ZipFile(io.BytesIO(cov_fh.read()))

# Extract the first csv file found in the archive and parse it
# (raises IndexError if the archive contains no csv file).
with cov_zf:
    csv_fn = [f for f in cov_zf.namelist() if f.endswith('.csv')][0]
    with cov_zf.open(csv_fn) as cov_fh:
        cov = pandas.read_csv(cov_fh)

cov['date'] = pandas.to_datetime(cov['date'])  # convert date string to timestamp

# Add a column with the ISO week number. `Series.dt.week` was deprecated in
# pandas 1.1 and removed in 2.0; `dt.isocalendar().week` is its replacement.
# It returns an unsigned (UInt32) column, so cast back to int64: the week
# shift applied later subtracts an offset and needs signed integers.
cov['week'] = cov['date'].dt.isocalendar().week.astype('int64')

# Shift by 25 weeks so the peak of flu season is in the center of the plot
# (otherwise it's split at opposite ends of the plot). A week number w is
# mapped to ((w + 25) % 52) - 25, so weeks 1..52 land in the range -25..26.
# NOTE(review): a week number of 53 wraps onto week 1 under this mapping --
# confirm that is acceptable for years that have 53 weeks.
week_offset = 25
flu['Week'] = ((flu['Week'] + week_offset) % 52) - week_offset
cov['week'] = ((cov['week'] + week_offset) % 52) - week_offset

# Flu data goes back to 2014, so for each week, find the best and worst cases
# across all years. The column is selected *before* aggregating so that only
# the series we need is reduced, rather than every column of the frame.
flu_deaths_by_week = flu.groupby('Week')['Influenza Deaths']
min_flu_deaths = flu_deaths_by_week.min().iloc[:52]
max_flu_deaths = flu_deaths_by_week.max().iloc[:52]

# Select US data from covid19 dataset
cov_us = cov[cov['location_name'] == 'United States of America']

# Covid19 data is tracked per day, so add up death counts for each week.
# Summing the whole frame would also try to sum the datetime 'date' column,
# which raises TypeError on pandas >= 2.0; select the columns first instead.
cov_us_by_week = cov_us.groupby('week')
cov_us_deaths_upper = cov_us_by_week['deaths_upper'].sum()
cov_us_deaths_lower = cov_us_by_week['deaths_lower'].sum()

# Remove future predictions from the plot; only show historical data.
# Only weeks whose upper and lower weekly totals are equal are kept.
cov_us_deaths = cov_us_deaths_upper[cov_us_deaths_upper == cov_us_deaths_lower]

# Configure background / foreground colors, then create the plot window
pg.setConfigOption('background', 'w')
pg.setConfigOption('foreground', 'k')
plt = pg.plot(labels={'left': 'US Deaths Per Week'})

# Label the bottom axis with month names. Each month is positioned at the
# day-of-year of its first day (in 2020) divided by 7, then shifted with the
# same week offset that was applied to the data.
month_names = [
    'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
    'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
]
month_vals = np.array([
    datetime(year=2020, month=m, day=1).timetuple().tm_yday / 7
    for m in range(1, 13)
])
month_vals = ((month_vals + week_offset) % 52) - week_offset
month_ticks = list(zip(month_vals, month_names))
plt.getAxis('bottom').setTicks([month_ticks])

# Options shared by both filled curves
curve_style = dict(antialias=True, fillLevel=0, pen='k')

# Worst-case flu deaths are drawn first ...
c2 = plt.plot(
    max_flu_deaths.index.to_numpy(),
    max_flu_deaths.to_numpy(),
    fillBrush=(100, 100, 0, 200),
    **curve_style
)

# ... followed by the covid-19 deaths
c1 = plt.plot(
    cov_us_deaths.index.to_numpy(),
    cov_us_deaths.to_numpy(),
    fillBrush=(200, 50, 0, 200),
    **curve_style
)
	"""
	A week-by-week comparison of deaths caused by influenza and COVID-19 in the United States.

	CDC influenza / pneumonia data:
	https://www.cdc.gov/flu/weekly/#S2

	COVID19 data:
	http://www.healthdata.org/covid/data-downloads
	"""

	import io
	import zipfile
	import urllib.request
	from datetime import datetime
	import numpy as np
	import pyqtgraph as pg
	import pandas

	# Download flu data from CDC and parse the csv file. The response is used as
	# a context manager so the HTTP connection is closed once parsing is done.
	flu_url = 'https://www.cdc.gov/flu/weekly/weeklyarchives2019-2020/data/NCHSData15.csv'
	with urllib.request.urlopen(flu_url) as flu_fh:
		flu = pandas.read_csv(flu_fh)

	# Download COVID-19 data from IHME. The zip archive is read fully into memory
	# so the connection can be closed before the archive is inspected.
	cov_url = 'https://ihmecovid19storage.blob.core.windows.net/latest/ihme-covid19.zip'
	with urllib.request.urlopen(cov_url) as cov_fh:
		cov_zf = zipfile.ZipFile(io.BytesIO(cov_fh.read()))

	# Extract the first csv file found in the archive and parse it
	# (raises IndexError if the archive contains no csv file).
	with cov_zf:
		csv_fn = [f for f in cov_zf.namelist() if f.endswith('.csv')][0]
		with cov_zf.open(csv_fn) as cov_fh:
			cov = pandas.read_csv(cov_fh)

	cov['date'] = pandas.to_datetime(cov['date']) # convert date string to timestamp

	# Add a column with the ISO week number. `Series.dt.week` was deprecated in
	# pandas 1.1 and removed in 2.0; `dt.isocalendar().week` is its replacement.
	# It returns an unsigned (UInt32) column, so cast back to int64: the week
	# shift applied later subtracts an offset and needs signed integers.
	cov['week'] = cov['date'].dt.isocalendar().week.astype('int64')

	# Shift by 25 weeks so the peak of flu season is in the center of the plot
	# (otherwise it's split at opposite ends of the plot). A week number w is
	# mapped to ((w + 25) % 52) - 25, so weeks 1..52 land in the range -25..26.
	# NOTE(review): a week number of 53 wraps onto week 1 under this mapping --
	# confirm that is acceptable for years that have 53 weeks.
	week_offset = 25
	flu['Week'] = ((flu['Week'] + week_offset) % 52) - week_offset
	cov['week'] = ((cov['week'] + week_offset) % 52) - week_offset

	# Flu data goes back to 2014, so for each week, find the best and worst cases
	# across all years. The column is selected *before* aggregating so that only
	# the series we need is reduced, rather than every column of the frame.
	flu_deaths_by_week = flu.groupby('Week')['Influenza Deaths']
	min_flu_deaths = flu_deaths_by_week.min().iloc[:52]
	max_flu_deaths = flu_deaths_by_week.max().iloc[:52]

	# Select US data from covid19 dataset
	cov_us = cov[cov['location_name'] == 'United States of America']

	# Covid19 data is tracked per day, so add up death counts for each week.
	# Summing the whole frame would also try to sum the datetime 'date' column,
	# which raises TypeError on pandas >= 2.0; select the columns first instead.
	cov_us_by_week = cov_us.groupby('week')
	cov_us_deaths_upper = cov_us_by_week['deaths_upper'].sum()
	cov_us_deaths_lower = cov_us_by_week['deaths_lower'].sum()

	# Remove future predictions from the plot; only show historical data.
	# Only weeks whose upper and lower weekly totals are equal are kept.
	cov_us_deaths = cov_us_deaths_upper[cov_us_deaths_upper == cov_us_deaths_lower]

	# Configure background / foreground colors, then create the plot window
	pg.setConfigOption('background', 'w')
	pg.setConfigOption('foreground', 'k')
	plt = pg.plot(labels={'left': 'US Deaths Per Week'})

	# Label the bottom axis with month names. Each month is positioned at the
	# day-of-year of its first day (in 2020) divided by 7, then shifted with the
	# same week offset that was applied to the data.
	month_names = [
		'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
		'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
	]
	month_vals = np.array([
		datetime(year=2020, month=m, day=1).timetuple().tm_yday / 7
		for m in range(1, 13)
	])
	month_vals = ((month_vals + week_offset) % 52) - week_offset
	month_ticks = list(zip(month_vals, month_names))
	plt.getAxis('bottom').setTicks([month_ticks])

	# Options shared by both filled curves
	curve_style = dict(antialias=True, fillLevel=0, pen='k')

	# Worst-case flu deaths are drawn first ...
	c2 = plt.plot(
		max_flu_deaths.index.to_numpy(),
		max_flu_deaths.to_numpy(),
		fillBrush=(100, 100, 0, 200),
		**curve_style
	)

	# ... followed by the covid-19 deaths
	c1 = plt.plot(
		cov_us_deaths.index.to_numpy(),
		cov_us_deaths.to_numpy(),
		fillBrush=(200, 50, 0, 200),
		**curve_style
	)