jorendorff/dataset.py

## dataset.py
"""Load the usafacts.org dataset on COVID-19 spread per U.S. county over time.

To use this, you need a copy of covid_confirmed_usafacts.csv,
which you can download at
<https://usafactsstatic.blob.core.windows.net/public/data/covid-19/covid_confirmed_usafacts.csv>.

I got there from here:
<https://usafacts.org/visualizations/coronavirus-covid-19-spread-map/>.
"""


import csv
from datetime import date
import re


def parse_date(s):
    """Parse a date written as month/day/two-digit-year, e.g. ``1/22/20``.

    Month and day must be written without zero padding (``1/5/20``, not
    ``01/05/20``). The two-digit year is added to 2000.

    Returns a datetime.date.

    Raises ValueError if `s` does not have that shape, or if it has the
    right shape but does not name a real calendar date (e.g. ``2/30/20``).
    """
    # fullmatch, rather than match with a trailing `$`, so that a string with
    # a trailing newline is rejected instead of silently accepted.
    m = re.fullmatch(r'(1[0-2]?|[2-9])/([1-9]\d?)/(\d\d)', s)
    if m is None:
        raise ValueError(f"invalid date {s!r} (expected mm/dd/yy)")
    mm, dd, yy = map(int, m.groups())
    try:
        return date(2000 + yy, mm, dd)
    except ValueError as exc:
        # Chain explicitly so the traceback names the underlying cause.
        raise ValueError(f"invalid date {s!r} ({exc})") from exc


class County:
    """Identifying details and per-date case counts for a single county.

    Instances carry five plain attributes: `fips`, `name`, `state`,
    `state_fips`, and `cases_by_date`, a dict built from the
    `cases_by_date` constructor argument.
    """

    def __init__(self, fips, name, state, state_fips, cases_by_date):
        self.fips, self.name = fips, name
        self.state, self.state_fips = state, state_fips
        # dict() makes a fresh dictionary, so the instance does not share
        # the container the caller passed in.
        self.cases_by_date = dict(cases_by_date)


# Header of the county FIPS column in the CSV. The leading U+FEFF is a byte
# order mark (BOM): it is part of this key so that the header still matches
# when the BOM at the start of the data file ends up attached to the first
# column name.
COUNTY_FIPS_KEY = '\ufeffcountyFIPS'  # cope with a BOM in the data file


def load(path="covid_confirmed_usafacts.csv"):
    """Load county COVID-19 data.

    `path` is the CSV file to read. It defaults to
    covid_confirmed_usafacts.csv in the current directory.

    Returns a dictionary that maps FIPS county codes (as ints) to County
    objects. Rows whose FIPS code is 0 and whose name is
    "Statewide Unallocated" are skipped.

    Every column other than the FIPS code, county name, state and state
    FIPS code is treated as a date column: its header is parsed with
    parse_date() and its value with int().

    Raises ValueError if a FIPS code appears on more than one row, or if a
    header or value cannot be parsed. Raises KeyError if one of the four
    identifying columns is missing.
    """
    # "utf-8-sig" drops a leading BOM if there is one, independent of the
    # platform's default encoding, so the first header arrives as the bare
    # column name. The BOM is stripped from COUNTY_FIPS_KEY to match.
    fips_key = COUNTY_FIPS_KEY.lstrip('\ufeff')
    counties = {}
    # newline="" lets the csv module do its own line-ending handling.
    with open(path, encoding="utf-8-sig", newline="") as f:
        for row in csv.DictReader(f):
            # Pop the identifying columns so only date columns remain.
            fips = int(row.pop(fips_key))
            name = row.pop("County Name")
            state = row.pop("State")
            state_fips = int(row.pop("stateFIPS"))
            if fips == 0 and name == "Statewide Unallocated":
                # Ignore the data in these rows
                continue
            if fips in counties:
                raise ValueError(f"County {fips} ({name}, {state}) appears more than once in the CSV")
            data = [(parse_date(date_str), int(ncases_str))
                    for date_str, ncases_str in row.items()]
            counties[fips] = County(fips, name, state, state_fips, data)
    return counties


## hot50.py
"""Attempt to reproduce the "Hot 50" counties list by @EthicalSkeptic.

This reproduces the list of counties in
<https://twitter.com/EthicalSkeptic/status/1285297329470398465>.

NOTE: THIS IS NOT AN ENDORSEMENT. The formula is clearly flawed, and I think
@EthicalSkeptic's work generally is so rife with confirmation bias (assuming
good faith) that it should be disregarded. (I feel bad saying so, but I also
don't want my part in this misrepresented, so I have little choice but to
say clearly what I think.)

I did this to see if this one formula produces consistent results.

Conclusion: The output fluctuates greatly from day to day. Noise in the input
is amplified. For example, the "heat" for Los Angeles County, CA on consecutive
days is:
    209, 237, 2725, 231, -162, -461, 578, -1550
It flaps between being one of the hottest counties in the country and one of
the coldest. Needless to say, this does not correspond to anything actually
happening in the real world. L.A. County is home to some ten million people;
it is either #1 or #2 in daily new cases every day over that time period.
"""

import dataset
from datetime import date, timedelta

def new_cases(county, d):
    """Return the number of new cases reported for `county` on date `d`.

    This is the figure stored for `d` minus the figure stored for the day
    before. Both dates must be keys of county.cases_by_date; a missing
    date raises KeyError.
    """
    totals = county.cases_by_date
    today = totals[d]
    yesterday = totals[d - timedelta(days=1)]
    return today - yesterday

def heat(county, d):
    """Return the "heat" of `county` on date `d`, as @EthicalSkeptic defines it.

    Heat is the number of new cases on `d` minus the number of new cases
    seven days earlier.

    The main weakness of this measure is that reporting is uneven, so the
    daily new-case figures it compares fluctuate quite a bit.
    """
    latest = new_cases(county, d)
    week_earlier = new_cases(county, d - timedelta(days=7))
    return latest - week_earlier

def hot50(counties, d):
    """Rank every county in the dataset by its "heat" on date `d`.

    `counties` is a mapping whose values are county objects. Returns a
    list of (county, heat) pairs, hottest first. Despite the name, the
    list covers all counties; it is not cut off at 50.
    """
    ranked = []
    for county in counties.values():
        ranked.append((county, heat(county, d)))
    ranked.sort(key=lambda entry: entry[1], reverse=True)
    return ranked

def main():
    """Print the "Hot N" ranking for 8 consecutive days from 2020-07-14.

    For each day, prints the date, then the top N (50) counties by heat.
    If Los Angeles County, CA is not among them, prints "..." followed by
    its own rank and heat. A blank line ends each day's output.
    """
    N = 50
    TARGET = "Los Angeles County", "CA"

    counties = dataset.load()

    # Show the "Hot N" for 8 consecutive days.
    for t in range(8):
        when = date(2020, 7, 14) + timedelta(days=t)
        hot = hot50(counties, when)
        print(when)
        for i, (c, h) in enumerate(hot[:N]):
            print(f"{i + 1:2d}. {h:5d} {c.name}, {c.state}")

        # Show L.A. County even if it wasn't in the top N for this day.
        # Unpacking into a one-element pattern insists on exactly one match.
        matches = [(i, hh) for i, (hc, hh) in enumerate(hot)
                   if (hc.name, hc.state) == TARGET]
        [(i_la, h_la)] = matches
        if i_la >= N:
            print("...")
            print(f"{i_la + 1:2d}. {h_la:5d} {TARGET[0]}, {TARGET[1]}")
        print()


# Run the report only when this file is executed as a script, so that
# importing the module (for example to reuse heat() or hot50()) does not
# read the CSV and print the rankings as a side effect.
if __name__ == "__main__":
    main()
	""" Load usafacts.org dataset on COVID-19 spread per U.S. county over time.

	To use this, you need a copy of covid_confirmed_usafacts.csv,
	which you can download at
	<https://usafactsstatic.blob.core.windows.net/public/data/covid-19/covid_confirmed_usafacts.csv>.

	I got there from here:
	<https://usafacts.org/visualizations/coronavirus-covid-19-spread-map/>.
	"""


	import csv
	from datetime import date
	import re


	def parse_date(s):
		"""Parse a date written as month/day/two-digit-year, e.g. ``1/22/20``.

		Month and day must be written without zero padding (``1/5/20``, not
		``01/05/20``). The two-digit year is added to 2000.

		Returns a datetime.date.

		Raises ValueError if `s` does not have that shape, or if it has the
		right shape but does not name a real calendar date (e.g. ``2/30/20``).
		"""
		# The alternation must be a bare `|`; an escaped pipe would match a
		# literal "|" and leave months 2-9 unparseable. fullmatch is used so
		# that a string with a trailing newline is rejected.
		m = re.fullmatch(r'(1[0-2]?|[2-9])/([1-9]\d?)/(\d\d)', s)
		if m is None:
			raise ValueError(f"invalid date {s!r} (expected mm/dd/yy)")
		mm, dd, yy = map(int, m.groups())
		try:
			return date(2000 + yy, mm, dd)
		except ValueError as exc:
			# Chain explicitly so the traceback names the underlying cause.
			raise ValueError(f"invalid date {s!r} ({exc})") from exc


	class County:
		"""Identifying details and per-date case counts for a single county.

		Instances carry five plain attributes: `fips`, `name`, `state`,
		`state_fips`, and `cases_by_date`, a dict built from the
		`cases_by_date` constructor argument.
		"""

		def __init__(self, fips, name, state, state_fips, cases_by_date):
			self.fips = fips
			self.name = name
			self.state = state
			self.state_fips = state_fips
			# dict() makes a fresh dictionary, so the instance does not
			# share the container the caller passed in.
			self.cases_by_date = dict(cases_by_date)


	# Header of the county FIPS column in the CSV. The leading U+FEFF is a
	# byte order mark (BOM): it is part of this key so that the header still
	# matches when the BOM at the start of the data file ends up attached to
	# the first column name.
	COUNTY_FIPS_KEY = '\ufeffcountyFIPS' # cope with a BOM in the data file


	def load(path="covid_confirmed_usafacts.csv"):
		"""Load county COVID-19 data.

		`path` is the CSV file to read. It defaults to
		covid_confirmed_usafacts.csv in the current directory.

		Returns a dictionary that maps FIPS county codes (as ints) to County
		objects. Rows whose FIPS code is 0 and whose name is
		"Statewide Unallocated" are skipped.

		Every column other than the FIPS code, county name, state and state
		FIPS code is treated as a date column: its header is parsed with
		parse_date() and its value with int().

		Raises ValueError if a FIPS code appears on more than one row, or if
		a header or value cannot be parsed. Raises KeyError if one of the
		four identifying columns is missing.
		"""
		# "utf-8-sig" drops a leading BOM if there is one, independent of the
		# platform's default encoding, so the first header arrives as the
		# bare column name. The BOM is stripped from COUNTY_FIPS_KEY to match.
		fips_key = COUNTY_FIPS_KEY.lstrip('\ufeff')
		counties = {}
		# newline="" lets the csv module do its own line-ending handling.
		with open(path, encoding="utf-8-sig", newline="") as f:
			for row in csv.DictReader(f):
				# Pop the identifying columns so only date columns remain.
				fips = int(row.pop(fips_key))
				name = row.pop("County Name")
				state = row.pop("State")
				state_fips = int(row.pop("stateFIPS"))
				if fips == 0 and name == "Statewide Unallocated":
					# Ignore the data in these rows
					continue
				if fips in counties:
					raise ValueError(f"County {fips} ({name}, {state}) appears more than once in the CSV")
				data = [(parse_date(date_str), int(ncases_str))
						for date_str, ncases_str in row.items()]
				counties[fips] = County(fips, name, state, state_fips, data)
		return counties
	"""Attempt to reproduce the "Hot 50" counties list by @EthicalSkeptic.

	This reproduces the list of counties in
	<https://twitter.com/EthicalSkeptic/status/1285297329470398465>.

	NOTE: THIS IS NOT AN ENDORSEMENT. The formula is clearly flawed, and I think
	@EthicalSkeptic's work generally is so rife with confirmation bias (assuming
	good faith) that it should be disregarded. (I feel bad saying so, but I also
	don't want my part in this misrepresented, so I have little choice but to
	say clearly what I think.)

	I did this to see if this one formula produces consistent results.

	Conclusion: The output fluctuates greatly from day to day. Noise in the input
	is amplified. For example, the "heat" for Los Angeles County, CA on consecutive
	days is:
	209, 237, 2725, 231, -162, -461, 578, -1550
	It flaps between being one of the hottest counties in the country and one of
	the coldest. Needless to say, this does not correspond to anything actually
	happening in the real world. L.A. County is home to some ten million people;
	it is either #1 or #2 in daily new cases every day over that time period.
	"""

	import dataset
	from datetime import date, timedelta

	def new_cases(county, d):
		"""Return the number of new cases reported for `county` on date `d`.

		This is the figure stored for `d` minus the figure stored for the day
		before. Both dates must be keys of county.cases_by_date; a missing
		date raises KeyError.
		"""
		return county.cases_by_date[d] - county.cases_by_date[d - timedelta(days=1)]

	def heat(county, d):
		"""Return the "heat" of `county` on date `d`, as @EthicalSkeptic defines it.

		Heat is the number of new cases on `d` minus the number of new cases
		seven days earlier.

		The main problem with this is that reporting is uneven, so new_cases
		fluctuates quite a bit.
		"""
		return new_cases(county, d) - new_cases(county, d - timedelta(days=7))

	def hot50(counties, d):
		"""Rank every county in the dataset by its "heat" on date `d`.

		`counties` is a mapping whose values are county objects. Returns a
		list of (county, heat) pairs, hottest first. Despite the name, the
		list covers all counties; it is not cut off at 50.
		"""
		counties_with_heat = [(c, heat(c, d)) for c in counties.values()]
		return sorted(counties_with_heat, key=lambda pair: pair[1], reverse=True)

	def main():
		"""Print the "Hot N" ranking for 8 consecutive days from 2020-07-14.

		For each day, prints the date, then the top N (50) counties by heat.
		If Los Angeles County, CA is not among them, prints "..." followed by
		its own rank and heat. A blank line ends each day's output.
		"""
		counties = dataset.load()

		N = 50
		TARGET = "Los Angeles County", "CA"

		# Show the "Hot N" for 8 consecutive days.
		for t in range(8):
			when = date(2020, 7, 14) + timedelta(days=t)
			hot = hot50(counties, when)
			print(when)
			for i, (c, h) in enumerate(hot[:N]):
				print(f"{i + 1:2d}. {h:5d} {c.name}, {c.state}")

			# Show L.A. County even if it wasn't in the top N for this day.
			# Unpacking into a one-element pattern insists on exactly one match.
			[(i_la, h_la)] = [(i, hh) for i, (hc, hh) in enumerate(hot)
							  if (hc.name, hc.state) == TARGET]
			if i_la >= N:
				print("...")
				print(f"{i_la + 1:2d}. {h_la:5d} {TARGET[0]}, {TARGET[1]}")
			print()


	# Run the report only when this file is executed as a script, so that
	# importing the module does not read the CSV and print the rankings as
	# a side effect.
	if __name__ == "__main__":
		main()