simeonmiteff/scrape_za_nicd_covid19.py

## scrape_za_nicd_covid19.py
#!/usr/bin/env python3
from bs4 import BeautifulSoup
import urllib.request

def scrape_update(url):
    """Yield the per-province case entries found on one update page.

    The document at *url* is downloaded and parsed, and the tenth element
    with class ``elementor-widget-container`` is walked child by child.
    A ``<p>`` whose stringified contents are at most 50 characters long and
    mention "province" sets the current province; every ``<li>`` of a
    ``<ul>`` seen after that is reported under that province.

    Args:
        url: Address of the update page to scrape.

    Yields:
        dict: ``{'province': ..., 'case_string': ...}`` where ``province`` is
        the stripped heading text and ``case_string`` is the first child of
        the list item. Empty list items are skipped.

    Raises:
        IndexError: If the page has fewer than ten widget containers.
        urllib.error.URLError: If the page cannot be fetched.
    """
    # Read the whole body and leave the ``with`` block before the first
    # yield, so the connection is not held open (or leaked when the generator
    # is abandoned) while the consumer works through the results.
    with urllib.request.urlopen(url) as response:
        html_doc = response.read()
    soup = BeautifulSoup(html_doc, 'html.parser')

    # Not sure how stable indexing into the divs will be
    containers = soup.find_all(class_="elementor-widget-container")
    if len(containers) < 10:
        raise IndexError(
            'expected at least 10 "elementor-widget-container" elements '
            'in %r, found %d' % (url, len(containers)))
    div = containers[9]

    province = None
    for node in div.children:
        if node.name == 'p':
            s = str(node.contents).lower()
            if len(s) <= 50 and 'province' in s:
                first = node.contents[0]
                # The heading text normally sits inside a wrapping tag
                # (first.contents[0]); fall back to the child itself when it
                # is already a bare string, which has no ``contents``.
                inner = getattr(first, 'contents', None)
                province = (inner[0] if inner else first).strip()
        elif province and node.name == 'ul':
            # A distinct loop variable avoids clobbering the outer one.
            for item in node.children:
                if item.name == 'li' and item.contents:
                    yield {'province': province,
                           'case_string': item.contents[0]}

def main():
    """Print every case entry from the updates linked on the alerts page.

    Fetches the NICD alerts listing, keeps the ``<article>`` elements whose
    thumbnail link points at a "covid-19-update" page and whose thumbnail
    image source mentions "covid-19-statistics", and prints each dict that
    ``scrape_update`` yields for those pages.
    """
    # FIXME: This does not deal with pagination
    # Read the listing and close the connection before fetching the
    # individual articles, rather than keeping it open for the whole run.
    with urllib.request.urlopen('http://www.nicd.ac.za/media/alerts/') as response:
        html_doc = response.read()
    soup = BeautifulSoup(html_doc, 'html.parser')

    for article in soup.find_all('article'):
        link = article.find(class_="elementor-post__thumbnail__link")
        # find() returns None when the article has no thumbnail link.
        if link is None:
            continue
        href = link.get('href')
        if not href or 'covid-19-update' not in href:
            continue
        img = link.find('img')
        # Likewise, the link may not wrap an <img> at all.
        if img is None:
            continue
        src = img.get('src')
        if not src or 'covid-19-statistics' not in src.lower():
            continue
        for case in scrape_update(href):
            print(case)


if __name__ == "__main__":
    main()
	#!/usr/bin/env python3
	from bs4 import BeautifulSoup
	import urllib.request

	def scrape_update(url):
	    """Yield the per-province case entries found on one update page.

	    The document at *url* is downloaded and parsed, and the tenth element
	    with class ``elementor-widget-container`` is walked child by child.
	    A ``<p>`` whose stringified contents are at most 50 characters long
	    and mention "province" sets the current province; every ``<li>`` of a
	    ``<ul>`` seen after that is reported under that province.

	    Args:
	        url: Address of the update page to scrape.

	    Yields:
	        dict: ``{'province': ..., 'case_string': ...}`` where ``province``
	        is the stripped heading text and ``case_string`` is the first
	        child of the list item. Empty list items are skipped.

	    Raises:
	        IndexError: If the page has fewer than ten widget containers.
	        urllib.error.URLError: If the page cannot be fetched.
	    """
	    # Read the whole body and leave the ``with`` block before the first
	    # yield, so the connection is not held open while the consumer works
	    # through the results.
	    with urllib.request.urlopen(url) as response:
	        html_doc = response.read()
	    soup = BeautifulSoup(html_doc, 'html.parser')

	    # Not sure how stable indexing into the divs will be
	    containers = soup.find_all(class_="elementor-widget-container")
	    if len(containers) < 10:
	        raise IndexError(
	            'expected at least 10 "elementor-widget-container" elements '
	            'in %r, found %d' % (url, len(containers)))
	    div = containers[9]

	    province = None
	    for node in div.children:
	        if node.name == 'p':
	            s = str(node.contents).lower()
	            if len(s) <= 50 and 'province' in s:
	                first = node.contents[0]
	                # The heading text normally sits inside a wrapping tag;
	                # fall back to the child itself when it is a bare string,
	                # which has no ``contents``.
	                inner = getattr(first, 'contents', None)
	                province = (inner[0] if inner else first).strip()
	        elif province and node.name == 'ul':
	            # A distinct loop variable avoids clobbering the outer one.
	            for item in node.children:
	                if item.name == 'li' and item.contents:
	                    yield {'province': province,
	                           'case_string': item.contents[0]}

	def main():
	    """Print every case entry from the updates linked on the alerts page.

	    Fetches the NICD alerts listing, keeps the ``<article>`` elements whose
	    thumbnail link points at a "covid-19-update" page and whose thumbnail
	    image source mentions "covid-19-statistics", and prints each dict that
	    ``scrape_update`` yields for those pages.
	    """
	    # FIXME: This does not deal with pagination
	    # Read the listing and close the connection before fetching the
	    # individual articles, rather than keeping it open for the whole run.
	    with urllib.request.urlopen('http://www.nicd.ac.za/media/alerts/') as response:
	        html_doc = response.read()
	    soup = BeautifulSoup(html_doc, 'html.parser')

	    for article in soup.find_all('article'):
	        link = article.find(class_="elementor-post__thumbnail__link")
	        # find() returns None when the article has no thumbnail link.
	        if link is None:
	            continue
	        href = link.get('href')
	        if not href or 'covid-19-update' not in href:
	            continue
	        img = link.find('img')
	        # Likewise, the link may not wrap an <img> at all.
	        if img is None:
	            continue
	        src = img.get('src')
	        if not src or 'covid-19-statistics' not in src.lower():
	            continue
	        for case in scrape_update(href):
	            print(case)


	if __name__ == "__main__":
	    main()