Skip to content

Instantly share code, notes, and snippets.

@a4amaan
Created July 16, 2020 12:29
Show Gist options
  • Save a4amaan/40e795e5b958338bcf4e9e54f99d9634 to your computer and use it in GitHub Desktop.
Save a4amaan/40e795e5b958338bcf4e9e54f99d9634 to your computer and use it in GitHub Desktop.
Scrape Data from PDF Documents
tables = tabula.read_pdf('https://www.who.int/docs/default-source/coronaviruse/situation-reports/20200715-covid-19-sitrep-177.pdf', pages="all", multiple_tables=True)
countries = []
for data_frame in tables:
data_dicts = data_frame.T.to_dict().values()
for data_item in data_dicts:
for key in data_item:
if 'Country' in key: # if key is 'Reporting Country/Territory/Area'
country = find(who_name=data_item[key]) # if Country Exists in the List
if country:
confirmed = data_item.get('Total confirmed', None)
confirmed_new = data_item.get('Total confirmed.1', None)
deaths = data_item.get('Total deaths', None)
deaths_new = data_item.get('Total new deaths', None)
if isinstance(confirmed, str):
confirmed = confirmed.replace(' ', '')
confirmed = decimal.Decimal(confirmed)
if isinstance(confirmed_new, str):
confirmed_new = confirmed_new.replace(' ', '')
confirmed_new = decimal.Decimal(confirmed_new)
if isinstance(deaths, str):
deaths = deaths.replace(' ', '')
deaths = decimal.Decimal(deaths)
if isinstance(deaths_new, str):
deaths_new = deaths_new.replace(' ', '')
deaths_new = decimal.Decimal(deaths_new)
country['confirmed'] = replace_nam(confirmed)
country['confirmed_new'] = replace_nam(confirmed_new)
country['deaths'] = replace_nam(deaths)
country['deaths_new'] = replace_nam(deaths_new)
country['date'] = date
countries.append(country)
return countries
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment