Skip to content

Instantly share code, notes, and snippets.

@a4amaan
Created August 4, 2020 06:34
Show Gist options
  • Save a4amaan/7c4ce428ade96edace881a920e274d4c to your computer and use it in GitHub Desktop.
Save a4amaan/7c4ce428ade96edace881a920e274d4c to your computer and use it in GitHub Desktop.
Scrape Data from WorldoMeter
import os
from datetime import datetime, timezone, timedelta
import psycopg2
from bs4 import BeautifulSoup
import requests
import re
from aman.countries import find
url = "https://www.worldometers.info/coronavirus/"
html_content = requests.get(url).text
soup = BeautifulSoup(html_content, "lxml")
table_today = soup.find("table", attrs={"id": "main_table_countries_today"}).find("tbody").find_all("tr")
table_yesterday = soup.find("table", attrs={"id": "main_table_countries_yesterday"}).find("tbody").find_all("tr")
table_yesterday2 = soup.find("table", attrs={"id": "main_table_countries_yesterday2"}).find("tbody").find_all("tr")
def save_worldo_data(countries):
now = datetime.now()
DB_HOST = os.environ.get('DB_HOST', '172.31.31.111')
DB_PASSWORD = os.environ.get('DB_PASSWORD', '123')
DB_CONN_STRING = 'postgresql://postgres:' + DB_PASSWORD + '@' + DB_HOST + ':5432/ds_pandemic_tracker'
connection = psycopg2.connect(DB_CONN_STRING)
cursor = connection.cursor()
for country in countries:
update_query = """
update ak_pandemic_data set
"Confirmed_Worldo" = {confirmed},
"Confirmed_Change_Worldo" = {confirmed_change},
"Confirmed_Per_Million_Worldo" = {cases_per_million},
"Deaths_Worldo" = {deaths}, "Deaths_Change_Worldo" = {deaths_change}, "Deaths_Per_Million_Worldo" = {deaths_per_million},
"Recovered_Worldo" = {recovered}, "Recovered_Change_Worldo" = {recovered_change},
"Active_Worldo" = {active}, "Serious_Worldo" = {serious},
"Tests_Worldo" = {tests}, "Tests_Per_Million_Worldo" = {tests_per_million}, "Population_Worldo" = {population},
"ISO" = '{iso_code}', "Date_Updated" = '{now}'
where "CountryCode" = '{country_code}' and "Date" = '{date}';
""".format(
confirmed=country['confirmed'], confirmed_change=country['confirmed_change'],
cases_per_million=country['cases_per_million'],
deaths=country['deaths'], deaths_change=country['deaths_change'],
deaths_per_million=country['deaths_per_million'],
recovered=country['recovered'], recovered_change=country['recovered_change'],
active=country['active'], serious=country['serious'], tests=country['tests'],
tests_per_million=country['tests_per_million'], population=country['population'],
now=now, date=country['date'], iso_code=country['iso_code'], country=country['name'],
country_code=country['code'],
)
cursor.execute(update_query)
if cursor.rowcount == 0:
insert_query = """
INSERT INTO ak_pandemic_data ("Country", "CountryCode",
"Confirmed_Worldo", "Confirmed_Change_Worldo", "Confirmed_Per_Million_Worldo",
"Deaths_Worldo", "Deaths_Change_Worldo", "Deaths_Per_Million_Worldo",
"Recovered_Worldo", "Recovered_Change_Worldo", "Active_Worldo", "Serious_Worldo",
"Tests_Worldo", "Tests_Per_Million_Worldo", "Population_Worldo", "Date", "Date_Created", "ISO")
VALUES (
'{country}','{country_code}',
{confirmed},{confirmed_change},{cases_per_million},
{deaths}, {deaths_change}, {deaths_per_million}, {recovered}, {recovered_change}, {active}, {serious},
{tests}, {tests_per_million}, {population},'{date}', '{now}', '{iso_code}'
);
""".format(
confirmed=country['confirmed'], confirmed_change=country['confirmed_change'],
cases_per_million=country['cases_per_million'],
deaths=country['deaths'], deaths_change=country['deaths_change'],
deaths_per_million=country['deaths_per_million'],
recovered=country['recovered'], recovered_change=country['recovered_change'],
active=country['active'], serious=country['serious'], tests=country['tests'],
tests_per_million=country['tests_per_million'], population=country['population'],
now=now, date=country['date'], iso_code=country['iso_code'], country=country['name'],
country_code=country['code'],
)
cursor.execute(insert_query)
print('WORLDO AK INSERTED', country['code'], country['date'], country['confirmed'],
country['confirmed_change'],
country['deaths'])
else:
print('WORLDO AK UPDATED', country['code'], country['date'], country['confirmed'], country['confirmed_change'],
country['deaths'])
connection.commit()
connection.close()
def extract_worldo_data(table_rows, date):
countries = []
for row in table_rows:
cells = row.find_all("td")
country_name = cells[1].get_text()
country = find(name_worldo=country_name)
if country:
country['confirmed'] = re.sub('[^0-9]', '', cells[2].get_text()) or 'NULL'
country['confirmed_change'] = re.sub('[^0-9]', '', cells[3].get_text()) or 'NULL'
country['deaths'] = re.sub('[^0-9]', '', cells[4].get_text()) or 'NULL'
country['deaths_change'] = re.sub('[^0-9]', '', cells[5].get_text()) or 'NULL'
country['recovered'] = re.sub('[^0-9]', '', cells[6].get_text()) or 'NULL'
country['recovered_change'] = re.sub('[^0-9]', '', cells[7].get_text()) or 'NULL'
country['active'] = re.sub('[^0-9]', '', cells[8].get_text()) or 'NULL'
country['serious'] = re.sub('[^0-9]', '', cells[9].get_text()) or 'NULL'
country['cases_per_million'] = re.sub('[^0-9]', '', cells[10].get_text()) or 'NULL'
country['deaths_per_million'] = re.sub('[^0-9]', '', cells[11].get_text()) or 'NULL'
country['tests'] = re.sub('[^0-9]', '', cells[12].get_text()) or 'NULL'
country['tests_per_million'] = re.sub('[^0-9]', '', cells[13].get_text()) or 'NULL'
country['population'] = re.sub('[^0-9]', '', cells[14].get_text()) or 'NULL'
country['date'] = date
countries.append(country)
else:
print('"{}"'.format(country_name) + 'Not Found in Dictionary')
return countries
def update_worldo_today_data():
today = datetime.now(timezone.utc)
countries = extract_worldo_data(table_today, today.strftime("%Y-%m-%d"))
save_worldo_data(countries)
def update_worldo_yesterday_data():
today = datetime.now(timezone.utc)
yesterday = today - timedelta(days=1)
countries = extract_worldo_data(table_yesterday, yesterday.strftime("%Y-%m-%d"))
save_worldo_data(countries)
def update_worldo_yesterday2_data():
today = datetime.now(timezone.utc)
yesterday = today - timedelta(days=1)
yesterday2 = yesterday - timedelta(days=1)
countries = extract_worldo_data(table_yesterday2, yesterday2.strftime("%Y-%m-%d"))
save_worldo_data(countries)
if __name__ == '__main__':
today = datetime.now(timezone.utc)
yesterday = today - timedelta(days=1)
yesterday2 = yesterday - timedelta(days=1)
print(today)
print(yesterday)
print(yesterday2)
countries = extract_worldo_data(table_today, today.strftime("%Y-%m-%d"))
save_worldo_data(countries)
countries = extract_worldo_data(table_yesterday, yesterday.strftime("%Y-%m-%d"))
save_worldo_data(countries)
countries = extract_worldo_data(table_yesterday2, yesterday2.strftime("%Y-%m-%d"))
save_worldo_data(countries)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment