saipraveenkondapalli/Web Scrapping Covid 19.py

## Web Scrapping Covid 19.py
import urllib.request as req
from bs4 import BeautifulSoup
import pandas as pd
import xlwt.Workbook

# The hard-coded indices below depend on the layout of the Wikipedia page:
# visit the site and inspect its HTML before changing any of them.
# Goal: extract the country name and the active / recovered / death figures
# from the page, print them as a table with pandas, and write them to an
# Excel file.
url = 'https://en.wikipedia.org/wiki/COVID-19_pandemic_by_country_and_territory'
# Use a context manager so the HTTP response is closed once it is parsed.
with req.urlopen(url) as html:
    soup = BeautifulSoup(html, 'html.parser')

# Row headers (<th scope="row">). According to the original inspection of the
# page, country names occupy every second position from index 13 up to (but
# not including) 469; the entries in between are other tags.
row_headers = soup.find_all('th', scope='row')
# Index explicitly (instead of slicing) so that a page with fewer row headers
# than expected still fails loudly with IndexError. The cell text contains
# newline characters, which are removed here.
country_names = [row_headers[i].text.replace("\n", "")
                 for i in range(13, 469, 2)]

# Data cells (<td>). The cells are visited four at a time, starting at index
# 14 and ending with the group that starts at index 922. Within each group,
# offset 0 is read as active cases, offset 1 as recovered, offset 2 as deaths.
cells = soup.find_all('td')
active_cases = []
recovered_cases = []
death_cases = []
for start in range(14, 923, 4):
    active_cases.append(cells[start].text.replace("\n", ""))
    recovered_cases.append(cells[start + 1].text.replace("\n", ""))
    death_cases.append(cells[start + 2].text.replace("\n", ""))

# Print the data as a table using a pandas DataFrame.
df = pd.DataFrame({'country': country_names,
                   'Active': active_cases,
                   'Recovered': recovered_cases,
                   'Deaths': death_cases})
# Number the rows from 1 instead of the default 0.
df.index = range(1, len(df) + 1)
print(df)

# Write the data to an Excel sheet.
wb = xlwt.Workbook()
sheet1 = wb.add_sheet("Covid-19")
for col_idx, title in enumerate(("Country", "Active Cases",
                                 "Recovered Cases", "Death Cases")):
    sheet1.write(0, col_idx, title)
# Adjust the column widths. xlwt expresses widths in units of 1/256 of a
# character, so 256 * 25 is about 25 characters wide. Setting them in a loop
# avoids the misspelled `widht` attribute that previously left columns 1 and 3
# at their default width.
for col_idx, width_chars in enumerate((25, 20, 20, 20)):
    sheet1.col(col_idx).width = 256 * width_chars
# One spreadsheet row per country, below the header row.
rows = zip(country_names, active_cases, recovered_cases, death_cases)
for row_idx, row in enumerate(rows, start=1):
    for col_idx, value in enumerate(row):
        sheet1.write(row_idx, col_idx, value)
# Save the Excel file.
wb.save("covid.xls")
	import urllib.request as req
	from bs4 import BeautifulSoup
	import pandas as pd
	import xlwt.Workbook

	# The hard-coded indices below depend on the layout of the Wikipedia page:
	# visit the site and inspect its HTML before changing any of them.
	# Goal: extract the country name and the active / recovered / death figures
	# from the page, print them as a table with pandas, and write them to an
	# Excel file.
	url = 'https://en.wikipedia.org/wiki/COVID-19_pandemic_by_country_and_territory'
	# Use a context manager so the HTTP response is closed once it is parsed.
	with req.urlopen(url) as html:
		soup = BeautifulSoup(html, 'html.parser')

	# Row headers (<th scope="row">). According to the original inspection of
	# the page, country names occupy every second position from index 13 up to
	# (but not including) 469; the entries in between are other tags.
	row_headers = soup.find_all('th', scope='row')
	# Index explicitly (instead of slicing) so that a page with fewer row
	# headers than expected still fails loudly with IndexError. The cell text
	# contains newline characters, which are removed here.
	country_names = [
		row_headers[i].text.replace("\n", "")
		for i in range(13, 469, 2)
	]

	# Data cells (<td>). The cells are visited four at a time, starting at
	# index 14 and ending with the group that starts at index 922. Within each
	# group, offset 0 is read as active cases, offset 1 as recovered and
	# offset 2 as deaths.
	cells = soup.find_all('td')
	active_cases = []
	recovered_cases = []
	death_cases = []
	for start in range(14, 923, 4):
		active_cases.append(cells[start].text.replace("\n", ""))
		recovered_cases.append(cells[start + 1].text.replace("\n", ""))
		death_cases.append(cells[start + 2].text.replace("\n", ""))

	# Print the data as a table using a pandas DataFrame.
	df = pd.DataFrame({
		'country': country_names,
		'Active': active_cases,
		'Recovered': recovered_cases,
		'Deaths': death_cases,
	})
	# Number the rows from 1 instead of the default 0.
	df.index = range(1, len(df) + 1)
	print(df)

	# Write the data to an Excel sheet.
	wb = xlwt.Workbook()
	sheet1 = wb.add_sheet("Covid-19")
	column_titles = ("Country", "Active Cases", "Recovered Cases", "Death Cases")
	for col_idx, title in enumerate(column_titles):
		sheet1.write(0, col_idx, title)
	# Adjust the column widths. xlwt expresses widths in units of 1/256 of a
	# character, so 256 * 25 is about 25 characters wide. Setting them in a
	# loop avoids the misspelled `widht` attribute that previously left
	# columns 1 and 3 at their default width.
	for col_idx, width_chars in enumerate((25, 20, 20, 20)):
		sheet1.col(col_idx).width = 256 * width_chars
	# One spreadsheet row per country, below the header row.
	rows = zip(country_names, active_cases, recovered_cases, death_cases)
	for row_idx, row in enumerate(rows, start=1):
		for col_idx, value in enumerate(row):
			sheet1.write(row_idx, col_idx, value)
	# Save the Excel file.
	wb.save("covid.xls")