Created
August 4, 2020 06:34
-
-
Save a4amaan/7c4ce428ade96edace881a920e274d4c to your computer and use it in GitHub Desktop.
Scrape Data from WorldoMeter
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
from datetime import datetime, timezone, timedelta | |
import psycopg2 | |
from bs4 import BeautifulSoup | |
import requests | |
import re | |
from aman.countries import find | |
url = "https://www.worldometers.info/coronavirus/" | |
html_content = requests.get(url).text | |
soup = BeautifulSoup(html_content, "lxml") | |
table_today = soup.find("table", attrs={"id": "main_table_countries_today"}).find("tbody").find_all("tr") | |
table_yesterday = soup.find("table", attrs={"id": "main_table_countries_yesterday"}).find("tbody").find_all("tr") | |
table_yesterday2 = soup.find("table", attrs={"id": "main_table_countries_yesterday2"}).find("tbody").find_all("tr") | |
def save_worldo_data(countries): | |
now = datetime.now() | |
DB_HOST = os.environ.get('DB_HOST', '172.31.31.111') | |
DB_PASSWORD = os.environ.get('DB_PASSWORD', '123') | |
DB_CONN_STRING = 'postgresql://postgres:' + DB_PASSWORD + '@' + DB_HOST + ':5432/ds_pandemic_tracker' | |
connection = psycopg2.connect(DB_CONN_STRING) | |
cursor = connection.cursor() | |
for country in countries: | |
update_query = """ | |
update ak_pandemic_data set | |
"Confirmed_Worldo" = {confirmed}, | |
"Confirmed_Change_Worldo" = {confirmed_change}, | |
"Confirmed_Per_Million_Worldo" = {cases_per_million}, | |
"Deaths_Worldo" = {deaths}, "Deaths_Change_Worldo" = {deaths_change}, "Deaths_Per_Million_Worldo" = {deaths_per_million}, | |
"Recovered_Worldo" = {recovered}, "Recovered_Change_Worldo" = {recovered_change}, | |
"Active_Worldo" = {active}, "Serious_Worldo" = {serious}, | |
"Tests_Worldo" = {tests}, "Tests_Per_Million_Worldo" = {tests_per_million}, "Population_Worldo" = {population}, | |
"ISO" = '{iso_code}', "Date_Updated" = '{now}' | |
where "CountryCode" = '{country_code}' and "Date" = '{date}'; | |
""".format( | |
confirmed=country['confirmed'], confirmed_change=country['confirmed_change'], | |
cases_per_million=country['cases_per_million'], | |
deaths=country['deaths'], deaths_change=country['deaths_change'], | |
deaths_per_million=country['deaths_per_million'], | |
recovered=country['recovered'], recovered_change=country['recovered_change'], | |
active=country['active'], serious=country['serious'], tests=country['tests'], | |
tests_per_million=country['tests_per_million'], population=country['population'], | |
now=now, date=country['date'], iso_code=country['iso_code'], country=country['name'], | |
country_code=country['code'], | |
) | |
cursor.execute(update_query) | |
if cursor.rowcount == 0: | |
insert_query = """ | |
INSERT INTO ak_pandemic_data ("Country", "CountryCode", | |
"Confirmed_Worldo", "Confirmed_Change_Worldo", "Confirmed_Per_Million_Worldo", | |
"Deaths_Worldo", "Deaths_Change_Worldo", "Deaths_Per_Million_Worldo", | |
"Recovered_Worldo", "Recovered_Change_Worldo", "Active_Worldo", "Serious_Worldo", | |
"Tests_Worldo", "Tests_Per_Million_Worldo", "Population_Worldo", "Date", "Date_Created", "ISO") | |
VALUES ( | |
'{country}','{country_code}', | |
{confirmed},{confirmed_change},{cases_per_million}, | |
{deaths}, {deaths_change}, {deaths_per_million}, {recovered}, {recovered_change}, {active}, {serious}, | |
{tests}, {tests_per_million}, {population},'{date}', '{now}', '{iso_code}' | |
); | |
""".format( | |
confirmed=country['confirmed'], confirmed_change=country['confirmed_change'], | |
cases_per_million=country['cases_per_million'], | |
deaths=country['deaths'], deaths_change=country['deaths_change'], | |
deaths_per_million=country['deaths_per_million'], | |
recovered=country['recovered'], recovered_change=country['recovered_change'], | |
active=country['active'], serious=country['serious'], tests=country['tests'], | |
tests_per_million=country['tests_per_million'], population=country['population'], | |
now=now, date=country['date'], iso_code=country['iso_code'], country=country['name'], | |
country_code=country['code'], | |
) | |
cursor.execute(insert_query) | |
print('WORLDO AK INSERTED', country['code'], country['date'], country['confirmed'], | |
country['confirmed_change'], | |
country['deaths']) | |
else: | |
print('WORLDO AK UPDATED', country['code'], country['date'], country['confirmed'], country['confirmed_change'], | |
country['deaths']) | |
connection.commit() | |
connection.close() | |
def extract_worldo_data(table_rows, date): | |
countries = [] | |
for row in table_rows: | |
cells = row.find_all("td") | |
country_name = cells[1].get_text() | |
country = find(name_worldo=country_name) | |
if country: | |
country['confirmed'] = re.sub('[^0-9]', '', cells[2].get_text()) or 'NULL' | |
country['confirmed_change'] = re.sub('[^0-9]', '', cells[3].get_text()) or 'NULL' | |
country['deaths'] = re.sub('[^0-9]', '', cells[4].get_text()) or 'NULL' | |
country['deaths_change'] = re.sub('[^0-9]', '', cells[5].get_text()) or 'NULL' | |
country['recovered'] = re.sub('[^0-9]', '', cells[6].get_text()) or 'NULL' | |
country['recovered_change'] = re.sub('[^0-9]', '', cells[7].get_text()) or 'NULL' | |
country['active'] = re.sub('[^0-9]', '', cells[8].get_text()) or 'NULL' | |
country['serious'] = re.sub('[^0-9]', '', cells[9].get_text()) or 'NULL' | |
country['cases_per_million'] = re.sub('[^0-9]', '', cells[10].get_text()) or 'NULL' | |
country['deaths_per_million'] = re.sub('[^0-9]', '', cells[11].get_text()) or 'NULL' | |
country['tests'] = re.sub('[^0-9]', '', cells[12].get_text()) or 'NULL' | |
country['tests_per_million'] = re.sub('[^0-9]', '', cells[13].get_text()) or 'NULL' | |
country['population'] = re.sub('[^0-9]', '', cells[14].get_text()) or 'NULL' | |
country['date'] = date | |
countries.append(country) | |
else: | |
print('"{}"'.format(country_name) + 'Not Found in Dictionary') | |
return countries | |
def update_worldo_today_data(): | |
today = datetime.now(timezone.utc) | |
countries = extract_worldo_data(table_today, today.strftime("%Y-%m-%d")) | |
save_worldo_data(countries) | |
def update_worldo_yesterday_data(): | |
today = datetime.now(timezone.utc) | |
yesterday = today - timedelta(days=1) | |
countries = extract_worldo_data(table_yesterday, yesterday.strftime("%Y-%m-%d")) | |
save_worldo_data(countries) | |
def update_worldo_yesterday2_data(): | |
today = datetime.now(timezone.utc) | |
yesterday = today - timedelta(days=1) | |
yesterday2 = yesterday - timedelta(days=1) | |
countries = extract_worldo_data(table_yesterday2, yesterday2.strftime("%Y-%m-%d")) | |
save_worldo_data(countries) | |
if __name__ == '__main__': | |
today = datetime.now(timezone.utc) | |
yesterday = today - timedelta(days=1) | |
yesterday2 = yesterday - timedelta(days=1) | |
print(today) | |
print(yesterday) | |
print(yesterday2) | |
countries = extract_worldo_data(table_today, today.strftime("%Y-%m-%d")) | |
save_worldo_data(countries) | |
countries = extract_worldo_data(table_yesterday, yesterday.strftime("%Y-%m-%d")) | |
save_worldo_data(countries) | |
countries = extract_worldo_data(table_yesterday2, yesterday2.strftime("%Y-%m-%d")) | |
save_worldo_data(countries) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment