Skip to content

Instantly share code, notes, and snippets.

@reachsumit
Created December 20, 2017 06:34
Show Gist options
  • Save reachsumit/449c6d1921ee394fcf4252026111adad to your computer and use it in GitHub Desktop.
Save reachsumit/449c6d1921ee394fcf4252026111adad to your computer and use it in GitHub Desktop.
Scraping population data for each Indian district
from bs4 import BeautifulSoup as bs
import requests
# Base url for scraping
res=requests.get('http://www.citypopulation.de/India.html')
# delicious soup
soup= bs(res.text,'lxml')
# path to the required links
allLinks = soup.select('ul li a')
# keep only the links with full district data
allLinks = allLinks[int(len(allLinks)/2):]
# form direct links to each state
state = []
links = []
for item in allLinks:
state.append(item.text)
links.append("http://www.citypopulation.de/"+item.attrs['href'])
# use Pandas to directly fetch table from the webpage
population_df = pd.DataFrame(data=None,columns=['State','District','Population in 2001','Population in 2011'])
for idx, link in enumerate(links):
temp_df = pd.read_html(link)[0]
for index, row in temp_df.iterrows():
if temp_df['Status'].iloc[index] == 'District':
population_df = population_df.append({'State':state[idx],'District':temp_df['Name'].iloc[index],'Population in 2001':temp_df['PopulationCensus2001-03-01'].iloc[index],'Population in 2011':temp_df['PopulationCensus2011-03-01'].iloc[index]}, ignore_index=True)
# save population data to csv
population_df.to_csv("population.csv",index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment