Skip to content

Instantly share code, notes, and snippets.

@jorendorff
Last active July 22, 2020 18:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jorendorff/2cc339cdfe56ad00813b310a5d29f1eb to your computer and use it in GitHub Desktop.
Save jorendorff/2cc339cdfe56ad00813b310a5d29f1eb to your computer and use it in GitHub Desktop.
""" Load usafacts.org dataset on COVID-19 spread per U.S. county over time.
To use this, you need a copy of covid_confirmed_usafacts.csv,
which you can download at
<https://usafactsstatic.blob.core.windows.net/public/data/covid-19/covid_confirmed_usafacts.csv>.
I got there from here:
<https://usafacts.org/visualizations/coronavirus-covid-19-spread-map/>.
"""
import csv
from datetime import date
import re
def parse_date(s):
    """Parse a date in the form mm/dd/yy.

    The month and the day may each be written with or without a leading
    zero, so both ``1/5/20`` and ``01/05/20`` are accepted.  The two-digit
    year is offset by 2000 (``20`` means 2020).

    Returns a ``datetime.date``.

    Raises ValueError if ``s`` does not have the expected shape, or if it
    has the right shape but does not name a real calendar date (for example
    ``2/30/20``).  In the latter case the underlying error from ``date()``
    is attached as the cause.
    """
    # Month: 1-12, optionally zero-padded.  Day: one or two digits, not zero;
    # the exact upper bound depends on the month, so it is left to date().
    m = re.match(r'^(0?[1-9]|1[0-2])/(0?[1-9]|[1-9]\d)/(\d\d)$', s)
    if m is None:
        raise ValueError(f"invalid date {s!r} (expected mm/dd/yy)")
    mm, dd, yy = map(int, m.groups())
    try:
        return date(2000 + yy, mm, dd)
    except ValueError as exc:
        # Chain explicitly so the traceback shows the original reason as the
        # direct cause rather than as an unrelated error during handling.
        raise ValueError(f"invalid date {s!r} ({exc})") from exc
class County:
    """Identifying details and per-date case counts for a single county.

    All constructor arguments are stored under attributes of the same name.
    ``cases_by_date`` is the exception: it is copied into a new dict, so it
    may be given either as a mapping or as an iterable of (key, value) pairs.
    """

    def __init__(self, fips, name, state, state_fips, cases_by_date):
        # County-level identity first, then the state it belongs to.
        self.fips, self.name = fips, name
        self.state, self.state_fips = state, state_fips
        # Take a private copy so later changes to the caller's object do not
        # show up here.
        self.cases_by_date = dict(cases_by_date)
# Header of the county FIPS column as it reads when the data file's leading
# byte-order mark (U+FEFF) is decoded as part of the first header cell.
COUNTY_FIPS_KEY = '\ufeffcountyFIPS' # cope with a BOM in the data file
def load(path="covid_confirmed_usafacts.csv"):
    """Load county COVID-19 data.

    ``path`` is the CSV file to read.  It defaults to
    ``covid_confirmed_usafacts.csv`` in the current working directory.

    Returns a dictionary that maps FIPS county codes to County objects.

    Every column other than the county FIPS code, "County Name", "State" and
    "stateFIPS" is treated as a date column: its header is parsed with
    parse_date() and its value is converted with int().

    Raises ValueError if a county code appears more than once, or if a
    header or value cannot be parsed.  Raises KeyError if one of the four
    identifying columns is missing.
    """
    # The "utf-8-sig" codec drops a leading BOM if there is one, so the first
    # header is the plain column name whether or not the file has a BOM and
    # regardless of the platform's default encoding.
    fips_key = COUNTY_FIPS_KEY.lstrip('\ufeff')
    counties = {}
    # newline="" is what the csv module asks for, so that it can handle line
    # endings (including ones embedded in quoted fields) itself.
    with open(path, encoding="utf-8-sig", newline="") as f:
        for row in csv.DictReader(f):
            fips = int(row.pop(fips_key))
            name = row.pop("County Name")
            state = row.pop("State")
            state_fips = int(row.pop("stateFIPS"))
            if fips == 0 and name == "Statewide Unallocated":
                # Ignore the data in these rows
                continue
            if fips in counties:
                raise ValueError(f"County {fips} ({name}, {state}) appears more than once in the CSV")
            # Only the date columns are left in the row at this point.
            data = [(parse_date(date_str), int(ncases_str))
                    for date_str, ncases_str in row.items()]
            counties[fips] = County(fips, name, state, state_fips, data)
    return counties
"""Attempt to reproduce the "Hot 50" counties list by @EthicalSkeptic.
This reproduces the list of counties in
<https://twitter.com/EthicalSkeptic/status/1285297329470398465>.
NOTE: THIS IS NOT AN ENDORSEMENT. The formula is clearly flawed, and I think
@EthicalSkeptic's work generally is so rife with confirmation bias (assuming
good faith) that it should be disregarded. (I feel bad saying so, but I also
don't want my part in this misrepresented, so I have little choice but to
say clearly what I think.)
I did this to see if this one formula produces consistent results.
Conclusion: The output fluctuates greatly from day to day. Noise in the input
is amplified. For example, the "heat" for Los Angeles County, CA on consecutive
days is:
209, 237, 2725, 231, -162, -461, 578, -1550
It flaps between being one of the hottest counties in the country and one of
the coldest. Needless to say, this does not correspond to anything actually
happening in the real world. L.A. County is home to some ten million people;
it is either #1 or #2 in daily new cases every day over that time period.
"""
import dataset
from datetime import date, timedelta
def new_cases(county, d):
    """Return how many new cases the county reported on date d.

    This is the count recorded for d in county.cases_by_date minus the count
    recorded for the day before.  A KeyError is raised if either date is
    absent from county.cases_by_date.
    """
    totals = county.cases_by_date
    on_day = totals[d]
    on_previous_day = totals[d - timedelta(days=1)]
    return on_day - on_previous_day
def heat(county, d):
    """Return the county's "heat" on date d, as defined by @EthicalSkeptic.

    "Heat" is the number of new cases on d minus the number of new cases on
    the date 7 days before d.

    Reporting is uneven, so new_cases swings a lot from day to day, and
    those swings carry straight through into this difference.
    """
    one_week = timedelta(days=7)
    this_week = new_cases(county, d)
    last_week = new_cases(county, d - one_week)
    return this_week - last_week
def hot50(counties, d):
    """Rank every county in the dataset by its "heat" on date d.

    counties is the mapping of county codes to County objects.  Only its
    values are used.

    Returns a list of (county, heat) pairs covering all counties, hottest
    first.  Despite the name, the list is not cut off at 50 entries.
    Counties with equal heat keep the order they had in counties.
    """
    ranked = []
    for county in counties.values():
        ranked.append((county, heat(county, d)))
    ranked.sort(key=lambda entry: entry[1], reverse=True)
    return ranked
def main():
    """Print the "Hot N" ranking for 8 consecutive days from 2020-07-14.

    For each day this prints the date, the top N (50) counties by heat, and
    then a line for Los Angeles County, CA with its rank and heat.  The L.A.
    line is preceded by "..." when the county falls outside the top N.  Each
    day's output ends with a blank line.

    Raises ValueError if the ranking does not contain exactly one entry for
    Los Angeles County, CA.
    """
    N = 50
    TARGET = "Los Angeles County", "CA"
    first_day = date(2020, 7, 14)
    counties = dataset.load()

    # Show the "Hot N" for 8 consecutive days.
    for t in range(8):
        when = first_day + timedelta(days=t)
        hot = hot50(counties, when)
        print(when)
        for i, (c, h) in enumerate(hot[:N]):
            print(f"{i + 1:2d}. {h:5d} {c.name}, {c.state}")

        # Show L.A. County even if it wasn't in the top N for this day.
        la_entries = [
            (i, hh)
            for i, (hc, hh) in enumerate(hot)
            if (hc.name, hc.state) == TARGET
        ]
        # Unpacking a one-element list insists on exactly one match.
        [(i_la, h_la)] = la_entries
        if i_la >= N:
            print("...")
        print(f"{i_la + 1:2d}. {h_la:5d} {TARGET[0]}, {TARGET[1]}")
        print()
# Run the report. There is no __main__ guard here, so this call also executes
# whenever the module is imported, not only when it is run as a script.
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment