Skip to content

Instantly share code, notes, and snippets.

@simeonmiteff
Created March 23, 2020 19:48
Show Gist options
  • Save simeonmiteff/7a2100fb48e4752591c9795b9ef48c1c to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
from bs4 import BeautifulSoup
import urllib.request
def scrape_update(url, container_index=9):
    """Yield the per-province case entries listed on one update page.

    The page at *url* is downloaded and parsed with ``html.parser``.  Among
    all elements carrying the ``elementor-widget-container`` class, the one
    at position *container_index* is walked child by child:

    * a ``<p>`` whose stringified contents are at most 50 characters long
      and mention "province" (case-insensitively) starts a new province;
      its name is the stripped first item inside the paragraph's first child;
    * every ``<li>`` directly inside a ``<ul>`` that follows such a heading
      produces one result.

    Args:
        url: Address of the update page to fetch.
        container_index: Position of the widget container holding the
            statistics.  Defaults to 9, the position the page layout used
            when this scraper was written; the layout may change.

    Yields:
        Dicts of the form ``{'province': ..., 'case_string': ...}`` where
        ``case_string`` is the first content node of the list item.

    Raises:
        IndexError: If the page has no widget container at
            *container_index*.
    """
    with urllib.request.urlopen(url) as response:
        html_doc = response.read()
    soup = BeautifulSoup(html_doc, 'html.parser')

    # Not sure how stable indexing into the divs will be, so fail with a
    # message that says which page broke and how many containers it had.
    containers = soup.find_all(class_="elementor-widget-container")
    try:
        div = containers[container_index]
    except IndexError:
        raise IndexError(
            "{}: no widget container at index {} (page has {})".format(
                url, container_index, len(containers))
        ) from None

    province = None
    for node in div.children:
        if node.name == 'p':
            # Short paragraphs mentioning "province" act as section headings;
            # the length cap keeps ordinary body text from matching.
            text = str(node.contents).lower()
            if len(text) <= 50 and 'province' in text:
                province = list(node.children)[0].contents[0].strip()
        elif province and node.name == 'ul':
            # A distinct name for the inner loop avoids rebinding the outer
            # loop variable.
            for item in node.children:
                # Whitespace between tags shows up as children with no
                # tag name; an empty <li> has nothing to report.
                if item.name == 'li' and item.contents:
                    yield {'province': province,
                           'case_string': item.contents[0]}
if __name__ == "__main__":
    # Script entry point: fetch the alerts index page, pick out the articles
    # that look like COVID-19 statistics updates, and print every case entry
    # scraped from each of them.
    # FIXME: This does not deal with pagination
    with urllib.request.urlopen('http://www.nicd.ac.za/media/alerts/') as response:
        html_doc = response.read()
    soup = BeautifulSoup(html_doc, 'html.parser')

    for article in soup.find_all('article'):
        link = article.find(class_="elementor-post__thumbnail__link")
        # find() returns None for articles without a thumbnail link; skip
        # them instead of failing on the attribute lookup below.
        if link is None:
            continue
        href = link.get('href')
        if not href or 'covid-19-update' not in href:
            continue

        img = link.find('img')
        # Likewise, a thumbnail link is not guaranteed to wrap an image.
        if img is None:
            continue
        src = img.get('src')
        # The thumbnail file name is used as a second filter for statistics
        # posts, on top of the link address check above.
        if not src or 'covid-19-statistics' not in src.lower():
            continue

        for case in scrape_update(href):
            print(case)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment