Example of parsing/scraping data using BeautifulSoup. Source used here: http://www.labour.delhigovt.nic.in/
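At its core, the script below repeats a single pattern over many pages: fetch a page with requests, hand the HTML to BeautifulSoup, and walk the table rows. A minimal sketch of that pattern (the URL and cell index here are placeholders, not the real site layout):

import requests
from bs4 import BeautifulSoup

r = requests.get('http://example.com/some-page-with-a-table')
soup = BeautifulSoup(r.text, "html.parser")
for row in soup.select("table tr"):
    cells = row.find_all('td')
    if len(cells) > 0:
        print cells[0].text.strip()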
import requests
from bs4 import BeautifulSoup
import sys
import csv
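
# Bump the recursion limit -- BeautifulSoup can hit Python's default
# limit when it walks deeply nested or malformed HTML, so 5000 is
# presumably a safety margin here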
sys.setrecursionlimit(5000)
# We've now imported the packages that will do the heavy lifting
# for us, requests and BeautifulSoup
# The URL of each page we want to scrape gets built in a variable
# inside the loop so that the code down below can be a little cleaner
# Down below we'll add each registration number to this list (the
# list keeps its original inmates_list name, but the entries are
# establishment registration numbers)
inmates_list = []
for i in range(1, 4401):
    print "Iterating over page: " + str(i)
    url_to_scrape = 'http://www.labour.delhigovt.nic.in/ser/r_regstatus.asp?currentpage={}'.format(i)
    # Tell requests to retrieve the contents of the page (it'll be
    # grabbing what you see when you use the View Source feature in
    # your browser)
    r = requests.get(url_to_scrape)
    # We now have the source of the page; let's ask BeautifulSoup
    # to parse it for us.
    soup = BeautifulSoup(r.text, "html.parser")
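    # "html.parser" is Python's built-in parser -- no extra dependency
    # needed, though lxml is a faster drop-in if it's installed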
    local_list = []
    # BeautifulSoup provides nice ways to access the data in the
    # parsed page. Here, we'll use the select method and pass it a
    # CSS-style selector to grab all the rows in the table
    try:
        for table_row in soup.select("table tr"):
            # Each tr (table row) holds a number of td elements (most
            # people call these table cells) with the registration data
            cells = table_row.find_all('td')
            # Our table has one exception -- a row without any cells.
            # Let's handle that special case here by making sure we
            # have more than zero cells before processing the cells
            if len(cells) > 0:
                # The registration number appears in the second td
                # element -- the cell called 1, since we start
                # counting at 0
                data = cells[1].text.strip().encode('utf-8').split()
                # Add the registration number for this page to a
                # local list
                local_list.append(data[0])
                # print "Added {0} to the list".format(data[0])
    except IndexError:
        # A row with cells but no usable data landed here; drop the
        # stray first entry if one was collected
        if local_list:
            local_list.pop(0)
    # Merge this page's numbers into the running list, de-duplicating
    # with a set
    inmates_list = list(set(inmates_list + local_list))
    print "Page {} done, total data collected: {}".format(i, len(inmates_list))
inmates_list = list(set(inmates_list))
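
# Phase two: fetch the full detail record for each registration
# number collected above and write the rows out to a CSV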
inmate_details_list = []
count = 0
print "Now details fetch commences!"
fields = ['Reg. No.', 'Registration Date', 'Name', 'Category', 'Date of Commencement', 'e-Mail ID', 'Website URL',
          'Name of Occupier', 'Father Name of Occupier', 'Name of Manager', 'Fathers Name of Manager', 'Nature of Business',
          'Total Male Workers', 'Total Female Workers', 'Total Young Workers', 'Total Workers', 'Total Family Members Working in the Establishment',
          'Total Employers Working in Confidential Capacity', 'Certificate No.', 'Certificate Date', 'Address']
# Write the header row once before any data rows
with open('/tmp/logFile.csv', 'a') as f:
    writer = csv.writer(f)
    writer.writerow(fields)
# Loop through the list of registration numbers we built and fetch
# the detail page for each one
def clean_cell(cell):
    # Strip whitespace and the ':' label separator from a table cell
    return cell.text.strip().encode('utf-8').replace(":", "").strip()

for inmate in inmates_list:
    count = count + 1
    print "Details fetched for: {}, left: {}".format(count, len(inmates_list) - count)
    url_to_scrape = 'http://www.labour.delhigovt.nic.in/ser/r_estdet.asp?cer={}'.format(inmate)
    r = requests.get(url_to_scrape)
    soup = BeautifulSoup(r.text, "html.parser")
    local_detailed_data = []
    try:
        for table_row in soup.select("table tr"):
            cells = table_row.find_all('td')
            if len(cells) > 0:
                # Reg. No.
                local_detailed_data.append(clean_cell(cells[1]))
                # Registration Date
                local_detailed_data.append(clean_cell(cells[3]))
                # Name
                local_detailed_data.append(clean_cell(cells[5]))
                # Category
                local_detailed_data.append(clean_cell(cells[7]))
                # Date of Commencement
                local_detailed_data.append(clean_cell(cells[9]))
                # e-Mail ID
                local_detailed_data.append(clean_cell(cells[13]))
                # Website URL
                local_detailed_data.append(clean_cell(cells[15]))
                # Name of Occupier
                local_detailed_data.append(clean_cell(cells[17]))
                # Father Name of Occupier
                local_detailed_data.append(clean_cell(cells[19]))
                # Name of Manager
                local_detailed_data.append(clean_cell(cells[21]))
                # Fathers Name of Manager
                local_detailed_data.append(clean_cell(cells[23]))
                # Nature of Business
                local_detailed_data.append(clean_cell(cells[25]))
                # Total Male Workers
                local_detailed_data.append(clean_cell(cells[27]))
                # Total Female Workers
                local_detailed_data.append(clean_cell(cells[29]))
                # Total Young Workers
                local_detailed_data.append(clean_cell(cells[31]))
                # Total Workers
                local_detailed_data.append(clean_cell(cells[33]))
                # Total Family Members Working in the Establishment
                local_detailed_data.append(clean_cell(cells[35]))
                # Total Employers Working in Confidential Capacity
                local_detailed_data.append(clean_cell(cells[37]))
                # Certificate No.
                local_detailed_data.append(clean_cell(cells[39]))
                # Certificate Date
                local_detailed_data.append(clean_cell(cells[41]))
                # Address (note this comes from cell 10, out of
                # sequence with the fields above)
                local_detailed_data.append(clean_cell(cells[10]))
    except IndexError:
        # A detail page without the expected table layout lands here;
        # whatever was collected before the failure is kept
        pass
    inmate_details_list.append(local_detailed_data)
print "Count: {}, List: {}".format(count, len(inmates_list)) | |
    if ((count % 10) == 0) or (count == len(inmates_list)):
        with open('/tmp/logFile.csv', 'a') as f:
            writer = csv.writer(f)
            for eachFieldList in inmate_details_list:
                writer.writerow(eachFieldList)
        inmate_details_list = []