Skip to content

Instantly share code, notes, and snippets.

@timarnold
Created March 26, 2020 16:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save timarnold/f1292f68f54b364a3d6b25a9c0691703 to your computer and use it in GitHub Desktop.
import requests
import json
from html.parser import HTMLParser
from lxml import html, etree
from datetime import timedelta, date
# Wayback Machine availability API endpoint, pre-filled with the PA Department
# of Health coronavirus cases page; a timestamp (YYYYMMDDhhmmss) is appended
# per day to ask for the snapshot closest to that moment.
URL_BASE = "https://archive.org/wayback/available?url=https://www.health.pa.gov/topics/disease/coronavirus/Pages/Cases.aspx&timestamp="
# All 67 Pennsylvania counties; used both to validate scraped table cells and
# as the key set of the output data dict.
COUNTIES = [
"Adams",
"Allegheny",
"Armstrong",
"Beaver",
"Bedford",
"Berks",
"Blair",
"Bradford",
"Bucks",
"Butler",
"Cambria",
"Cameron",
"Carbon",
"Centre",
"Chester",
"Clarion",
"Clearfield",
"Clinton",
"Columbia",
"Crawford",
"Cumberland",
"Dauphin",
"Delaware",
"Elk",
"Erie",
"Fayette",
"Forest",
"Franklin",
"Fulton",
"Greene",
"Huntingdon",
"Indiana",
"Jefferson",
"Juniata",
"Lackawanna",
"Lancaster",
"Lawrence",
"Lebanon",
"Lehigh",
"Luzerne",
"Lycoming",
"McKean",
"Mercer",
"Mifflin",
"Monroe",
"Montgomery",
"Montour",
"Northampton",
"Northumberland",
"Perry",
"Philadelphia",
"Pike",
"Potter",
"Schuylkill",
"Snyder",
"Somerset",
"Sullivan",
"Susquehanna",
"Tioga",
"Union",
"Venango",
"Warren",
"Washington",
"Wayne",
"Westmoreland",
"Wyoming",
"York",
]
# Map of date -> URL to fetch for that date's data.
urls = {}


def daterange(start_date, end_date):
    """Yield each date from start_date up to, but not including, end_date."""
    total_days = (end_date - start_date).days
    for offset in range(total_days):
        yield start_date + timedelta(days=offset)
# Snapshot window: 2020-03-19 up to (but not including) 2020-03-25 comes from
# the Wayback Machine; 2020-03-26 comes from the live page.
start_date = date(2020, 3, 19)
end_date = date(2020, 3, 25)
date_strings = []

# Historical days: request the archived snapshot closest to 23:00 on each day.
for day in daterange(start_date, end_date):
    date_strings.append(day.strftime("%Y-%m-%d"))
    urls[day] = URL_BASE + day.strftime("%Y%m%d230000")

# Today's numbers are scraped straight from the live page.
today = date(2020, 3, 26)
date_strings.append(today.strftime("%Y-%m-%d"))
urls[today] = "https://www.health.pa.gov/topics/disease/coronavirus/Pages/Cases.aspx"
# Collected counts: county name -> list of daily case counts, one entry per
# fetched URL (0 when the county is absent from that day's table).
data = {county: [] for county in COUNTIES}

# NOTE: the loop variable was originally named `date`, shadowing the
# datetime.date import; renamed to avoid that hazard.
for snapshot_date, url in urls.items():
    # Both branches need the initial GET; only archived days need a second
    # request, because the availability API returns a JSON document that
    # points at the closest stored snapshot rather than the page itself.
    r = requests.get(url, allow_redirects=True)
    if snapshot_date.strftime("%Y-%m-%d") != date_strings[-1]:
        snapshot_url = json.loads(r.content)["archived_snapshots"]["closest"]["url"]
        r = requests.get(snapshot_url, allow_redirects=True)

    node_count = 0
    found_counties = []
    root = html.fromstring(r.content)
    for element in root.iter():
        # Only the table body holds county rows (a <tbody> can never also be
        # a <script>, so the original double tag check collapses to one).
        if element.tag != "tbody":
            continue
        for child in element.iter():
            # Skip non-cells and cells with no text (covers None and "").
            if child.tag != "td" or not child.text:
                continue
            # Skip any leading header cells before the first county name.
            if node_count == 0 and child.text not in COUNTIES:
                continue
            # Cells come in groups of three; the first of each group is the
            # county name and its next sibling holds the case count.
            if node_count % 3 == 0:
                # The page pads cells with zero-width spaces; strip them.
                key = child.text.replace(u"\u200b", "")
                if key in COUNTIES:
                    sibling = child.getnext()
                    value = int(sibling.text.replace(u"\u200b", "")) if sibling is not None else 0
                    data[key].append(value)
                    found_counties.append(key)
            node_count += 1

    # Counties missing from this day's table reported no cases.
    for county in COUNTIES:
        if county not in found_counties:
            data[county].append(0)

print(data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment