Skip to content

Instantly share code, notes, and snippets.

@saipraveenkondapalli
Last active May 30, 2020 05:04
Show Gist options
  • Save saipraveenkondapalli/770bfa2c87218a2a01ea7c7534c7c5e1 to your computer and use it in GitHub Desktop.
Save saipraveenkondapalli/770bfa2c87218a2a01ea7c7534c7c5e1 to your computer and use it in GitHub Desktop.
Web scraping using BeautifulSoup in Python from Wikipedia.
import urllib.request as req
from bs4 import BeautifulSoup
import pandas as pd
import xlwt.Workbook
# Download the Wikipedia "COVID-19 pandemic by country and territory" page and
# parse it. The rest of the script pulls country names and case figures out of
# the parsed tree, prints them as a pandas table and writes them to an Excel
# file.
# NOTE: the extraction relies on fixed positions within the page's HTML; inspect
# the page source in a browser to see how the table cells are laid out.
url = 'https://en.wikipedia.org/wiki/COVID-19_pandemic_by_country_and_territory'
# The context manager closes the HTTP response as soon as parsing is finished.
with req.urlopen(url) as html:
    soup = BeautifulSoup(html, 'html.parser')
# Every <th scope="row"> cell on the page, in document order.
c = soup.find_all('th', scope='row')
# Result lists for the case figures; they are filled from the <td> cells
# further down in the script.
active = []
recovered = []
deaths = []
# Per the original author's inspection of the page, the country names sit at
# every second position of ``c`` from index 13 through 467; the entries in
# between hold other tags. These offsets are hard-coded for the page layout the
# script was written against, and indexing raises IndexError if the page has
# fewer header cells than expected.
names = [c[x].text for x in range(13, 469, 2)]
# Case figures are read from the page's <td> cells, in document order.
ap = soup.find_all('td')
# Starting at cell 14 and stepping four cells at a time up to and including
# cell 922, the cell itself is stored as the "active" figure, the next cell as
# "recovered" and the one after that as "deaths".
# NOTE(review): which table column each offset really corresponds to depends on
# the page layout at scrape time - confirm against the live page. The offsets
# are hard-coded, so a shorter page raises IndexError.
for a in range(14, 923, 4):
    active.append(ap[a].text)
    deaths.append(ap[a + 2].text)
    recovered.append(ap[a + 1].text)
# The text scraped into names, active, recovered and deaths carries newline
# characters. Build a cleaned copy of each list in which every value is
# converted to str and has all "\n" characters removed.
country_names = [str(item).replace("\n", "") for item in names]
active_cases = [str(item).replace("\n", "") for item in active]
death_cases = [str(item).replace("\n", "") for item in deaths]
recovered_cases = [str(item).replace("\n", "") for item in recovered]
# Assemble the cleaned columns into a pandas DataFrame so that they print as a
# table.
df = pd.DataFrame({
    'country': country_names,
    'Active': active_cases,
    'Recovered': recovered_cases,
    'Deaths': death_cases,
})
# Number the rows from 1 instead of the default 0. The length is taken from the
# frame itself so the index always matches the number of rows.
df.index = range(1, len(df) + 1)
print(df)
# Write the same data to an Excel workbook containing one sheet, "Covid-19".
wb = xlwt.Workbook()
sheet1 = wb.add_sheet("Covid-19")
# Header row (row 0).
for col, title in enumerate(("Country", "Active Cases", "Recovered Cases", "Death Cases")):
    sheet1.write(0, col, title)
# Column widths: the country column is set to 256 * 25, the three figure
# columns to 256 * 20. The attribute must be spelled ``width``; assigning to a
# misspelled attribute leaves the column at its default width.
sheet1.col(0).width = 256 * 25
sheet1.col(1).width = 256 * 20
sheet1.col(2).width = 256 * 20
sheet1.col(3).width = 256 * 20
# One data row per country, starting at row 1 directly below the header.
records = zip(country_names, active_cases, recovered_cases, death_cases)
for row, record in enumerate(records, start=1):
    for col, value in enumerate(record):
        sheet1.write(row, col, value)
# Save the workbook to covid.xls in the current working directory.
wb.save("covid.xls")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment