Skip to content

Instantly share code, notes, and snippets.

Created March 26, 2020 16:06
Show Gist options
  • Save timarnold/f1292f68f54b364a3d6b25a9c0691703 to your computer and use it in GitHub Desktop.
Save timarnold/f1292f68f54b364a3d6b25a9c0691703 to your computer and use it in GitHub Desktop.
import requests
import json
from html.parser import HTMLParser
from lxml import html, etree
from datetime import timedelta, date
urls = {}
def daterange(start_date, end_date):
    """Yield each day from ``start_date`` up to, but not including, ``end_date``.

    Args:
        start_date: First date to yield.
        end_date: Exclusive upper bound; it is never yielded.

    Yields:
        Consecutive dates one day apart. Nothing is yielded when
        ``end_date`` is on or before ``start_date``.
    """
    # ``timedelta.days`` is already an int, so no extra conversion is needed.
    for offset in range((end_date - start_date).days):
        yield start_date + timedelta(days=offset)
# Window of past days for which a historical URL is built.
# NOTE(review): daterange() stops before end_date, so 2020-03-25 itself is
# never requested -- confirm whether that day was meant to be included.
start_date = date(2020, 3, 19)
end_date = date(2020, 3, 25)
date_strings = []
# Get historical data
for single_date in daterange(start_date, end_date):
    # Key each URL by its date; the suffix is that day formatted as
    # YYYYMMDD followed by the fixed time 230000.
    urls[single_date] = URL_BASE + single_date.strftime("%Y%m%d230000")
# Get today
date_strings.append(date(2020, 3, 26).strftime("%Y-%m-%d"))
# NOTE(review): today's URL is an empty string here -- the live page address
# looks to be missing; confirm before running.
urls[date(2020, 3, 26)] = ""
# One empty list per name in COUNTIES, keyed by that name.
data = {county: [] for county in COUNTIES}
# Fetch the page for every (date, url) pair in `urls` and scan its table cells.
# NOTE(review): the loop variable `date` rebinds the `date` class imported from
# datetime for the rest of the module.
# NOTE(review): indentation is absent in this listing and several `if`
# statements below have no visible body -- lines appear to be missing.
for date, url in urls.items():
# Behave differently for a live page vs Internet Archive
# The entry whose formatted date equals the last item of date_strings is
# treated as the special case.
if date.strftime("%Y-%m-%d") == date_strings[-1]:
r = requests.get(url, allow_redirects=True)
# NOTE(review): presumably an `else:` belongs here, so that the two requests
# below only run for the non-live entries -- confirm.
r = requests.get(url, allow_redirects=True)
# The first response is parsed as JSON and the URL stored under
# archived_snapshots -> closest -> url is then fetched.
r = requests.get(json.loads(r.content)["archived_snapshots"]["closest"]["url"], allow_redirects=True)
# Per-page state: number of table cells counted so far, and a list that the
# final loop checks against COUNTIES.
node_count = 0
found_counties = []
# Parse the response body as HTML with lxml.
root = html.fromstring(r.content)
# Walk every element, skipping <script>, and descend into each <tbody>.
for c1 in root.iter():
if c1.tag != "script":
if c1.tag == "tbody":
for child in c1.iter():
# Only <td> cells with non-empty text are considered (the `!= None` test is
# already implied by the truthiness check that follows it).
if child.tag == "td" and child.text != None and child.text:
# NOTE(review): no body is visible for this condition.
if node_count == 0 and child.text not in COUNTIES:
# Every third counted cell is read as a key.
if node_count % 3 == 0:
# Strip zero-width spaces (U+200B) from the cell text.
key = child.text.replace(u"\u200b", "")
if key in COUNTIES:
# The value is the next sibling's text, stripped of zero-width spaces and
# converted to int, or 0 when there is no next sibling.
# NOTE(review): nothing visible stores `value` or adds `key` to
# found_counties -- presumably those lines are missing.
value = int(child.getnext().text.replace(u"\u200b", "")) if child.getnext() is not None else 0
node_count += 1
# Check each name in COUNTIES against found_counties.
# NOTE(review): the body of the `if` below is not visible in this listing.
for county in COUNTIES:
if county not in found_counties:
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment