Last active
May 30, 2020 05:04
-
-
Save saipraveenkondapalli/770bfa2c87218a2a01ea7c7534c7c5e1 to your computer and use it in GitHub Desktop.
Web scraping using BeautifulSoup in Python from Wikipedia.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import urllib.request as req | |
from bs4 import BeautifulSoup | |
import pandas as pd | |
import xlwt.Workbook | |
# Scrape the per-country COVID-19 figures from Wikipedia.
#
# The parsing further down depends on the layout of that page: visit the site
# and inspect its HTML before changing any of the hard-coded positions.
# Goal: extract the country names and case counts from the web page into
# lists, print them in tabular form with pandas, and write them to an Excel
# file.
url = 'https://en.wikipedia.org/wiki/COVID-19_pandemic_by_country_and_territory'
# Open the response in a `with` block so the HTTP connection is released as
# soon as the document has been handed to the parser, rather than staying
# open for the rest of the run.
with req.urlopen(url) as html:
    soup = BeautifulSoup(html, 'html.parser')
# Every <th scope="row"> element of the page, in document order.
c = soup.findAll('th', scope='row')
active = []
recovered = []
deaths = []
# According to the author's inspection of the page, the country names occupy
# every second entry of `c`, starting at index 13 and stopping before 469;
# the entries in between hold other tags and are skipped.
names = [c[position].text for position in range(13, 469, 2)]
# Active, recovered and death figures are read from the page's <td> cells.
ap = soup.findAll('td')
count = 0  # not read anywhere in this section
# Cells are visited in steps of 4: the cell at `a` goes to `active`, the one
# right after it (`r`) to `recovered`, and the next one (`d`) to `deaths`.
# The first group starts at cell 14 and the last one at cell 922.
for a in range(14, 923, 4):
    r = a + 1
    d = a + 2
    active.append(ap[a].text)
    deaths.append(ap[d].text)
    recovered.append(ap[r].text)
# The scraped text in names, active, deaths and recovered carries newline
# characters; build a cleaned copy of each list with every "\n" removed.
country_names = [str(name).replace("\n", "") for name in names]
active_cases = [str(value).replace("\n", "") for value in active]
death_cases = [str(value).replace("\n", "") for value in deaths]
recovered_cases = [str(value).replace("\n", "") for value in recovered]
# Show the data as a table: one DataFrame column per cleaned list, with the
# row labels running from 1 to len(names) instead of starting at 0.
df = pd.DataFrame(
    {
        'country': country_names,
        'Active': active_cases,
        'Recovered': recovered_cases,
        'Deaths': death_cases,
    },
    index=list(range(1, len(names) + 1)),
)
print(df)
# Write the same data to an Excel (.xls) workbook containing a single sheet.
wb = xlwt.Workbook()
sheet1 = wb.add_sheet("Covid-19")
# Header row (row 0); the order here fixes the column order of the data rows.
headers = ("Country", "Active Cases", "Recovered Cases", "Death Cases")
for col, title in enumerate(headers):
    sheet1.write(0, col, title)
# Adjust the column widths (xlwt counts width in 1/256ths of a character).
# The attribute has to be spelled `width`; a misspelled name raises no error
# but leaves the column at its default width.
sheet1.col(0).width = 256 * 25
sheet1.col(1).width = 256 * 20
sheet1.col(2).width = 256 * 20
sheet1.col(3).width = 256 * 20
# Data rows start at row 1, directly below the header; zip walks the four
# cleaned lists in lockstep so each record lands on one sheet row.
records = zip(country_names, active_cases, recovered_cases, death_cases)
for i, record in enumerate(records, start=1):
    for col, value in enumerate(record):
        sheet1.write(i, col, value)
# Save the Excel file in the current working directory.
wb.save("covid.xls")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment