Skip to content

Instantly share code, notes, and snippets.

@timarnold
Created March 26, 2020 16:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save timarnold/f1292f68f54b364a3d6b25a9c0691703 to your computer and use it in GitHub Desktop.
import requests
import json
from html.parser import HTMLParser
from lxml import html, etree
from datetime import timedelta, date
# Wayback Machine availability API endpoint, pre-filled with the PA Department
# of Health coronavirus cases page; a timestamp (YYYYMMDDhhmmss) is appended
# per day to ask for the snapshot closest to that moment.
URL_BASE = "https://archive.org/wayback/available?url=https://www.health.pa.gov/topics/disease/coronavirus/Pages/Cases.aspx&timestamp="
# All 67 Pennsylvania counties; used both to validate scraped table cells and
# as the key set of the output data dict.
COUNTIES = [
"Adams",
"Allegheny",
"Armstrong",
"Beaver",
"Bedford",
"Berks",
"Blair",
"Bradford",
"Bucks",
"Butler",
"Cambria",
"Cameron",
"Carbon",
"Centre",
"Chester",
"Clarion",
"Clearfield",
"Clinton",
"Columbia",
"Crawford",
"Cumberland",
"Dauphin",
"Delaware",
"Elk",
"Erie",
"Fayette",
"Forest",
"Franklin",
"Fulton",
"Greene",
"Huntingdon",
"Indiana",
"Jefferson",
"Juniata",
"Lackawanna",
"Lancaster",
"Lawrence",
"Lebanon",
"Lehigh",
"Luzerne",
"Lycoming",
"McKean",
"Mercer",
"Mifflin",
"Monroe",
"Montgomery",
"Montour",
"Northampton",
"Northumberland",
"Perry",
"Philadelphia",
"Pike",
"Potter",
"Schuylkill",
"Snyder",
"Somerset",
"Sullivan",
"Susquehanna",
"Tioga",
"Union",
"Venango",
"Warren",
"Washington",
"Wayne",
"Westmoreland",
"Wyoming",
"York",
]
# Map of date -> URL to fetch for that date's data.
urls = {}


def daterange(start_date, end_date):
    """Yield each date from start_date up to, but not including, end_date."""
    total_days = (end_date - start_date).days
    for offset in range(total_days):
        yield start_date + timedelta(days=offset)
# Snapshot window: 2020-03-19 up to (but not including) 2020-03-25 comes from
# the Wayback Machine; 2020-03-26 comes from the live page.
start_date = date(2020, 3, 19)
end_date = date(2020, 3, 25)
date_strings = []

# Historical days: request the archived snapshot closest to 23:00 on each day.
for day in daterange(start_date, end_date):
    date_strings.append(day.strftime("%Y-%m-%d"))
    urls[day] = URL_BASE + day.strftime("%Y%m%d230000")

# Today's numbers are scraped straight from the live page.
today = date(2020, 3, 26)
date_strings.append(today.strftime("%Y-%m-%d"))
urls[today] = "https://www.health.pa.gov/topics/disease/coronavirus/Pages/Cases.aspx"
# Collected counts: county name -> list of daily case counts, one entry per
# fetched URL (0 when the county is absent from that day's table).
data = {county: [] for county in COUNTIES}

# NOTE: the loop variable was originally named `date`, shadowing the
# datetime.date import; renamed to avoid that hazard.
for snapshot_date, url in urls.items():
    # Both branches need the initial GET; only archived days need a second
    # request, because the availability API returns a JSON document that
    # points at the closest stored snapshot rather than the page itself.
    r = requests.get(url, allow_redirects=True)
    if snapshot_date.strftime("%Y-%m-%d") != date_strings[-1]:
        snapshot_url = json.loads(r.content)["archived_snapshots"]["closest"]["url"]
        r = requests.get(snapshot_url, allow_redirects=True)

    node_count = 0
    found_counties = []
    root = html.fromstring(r.content)
    for element in root.iter():
        # Only the table body holds county rows (a <tbody> can never also be
        # a <script>, so the original double tag check collapses to one).
        if element.tag != "tbody":
            continue
        for child in element.iter():
            # Skip non-cells and cells with no text (covers None and "").
            if child.tag != "td" or not child.text:
                continue
            # Skip any leading header cells before the first county name.
            if node_count == 0 and child.text not in COUNTIES:
                continue
            # Cells come in groups of three; the first of each group is the
            # county name and its next sibling holds the case count.
            if node_count % 3 == 0:
                # The page pads cells with zero-width spaces; strip them.
                key = child.text.replace(u"\u200b", "")
                if key in COUNTIES:
                    sibling = child.getnext()
                    value = int(sibling.text.replace(u"\u200b", "")) if sibling is not None else 0
                    data[key].append(value)
                    found_counties.append(key)
            node_count += 1

    # Counties missing from this day's table reported no cases.
    for county in COUNTIES:
        if county not in found_counties:
            data[county].append(0)

print(data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment