Example web scraping in Python
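A note on assumptions: the script below is Python 2 code (urllib2, print statements) written against the BeautifulSoup 3 API; on a Python 2.7 setup, "pip install BeautifulSoup" should pull in the 3.x series it imports. The URL patterns and hard-coded table indices reflect the dpr.go.id and mpr.go.id page layouts as of 2012, so expect to re-derive them if those sites have changed.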
'''
webscrape.py - Python module to allow scraping data off of a website.
About: scrapes member profiles (names, party affiliation, electoral
area, and birth data) from the Indonesian DPR and MPR parliament
websites and writes them out as CSV spreadsheets.
@author: Jason Dsouza
'''
__author__ = 'jasonrdsouza (Jason Dsouza)'
import urllib2
import string
import csv
from BeautifulSoup import BeautifulSoup  # html parser
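# Member listings for Indonesia's two national legislative bodies: the DPR
# (Dewan Perwakilan Rakyat, House of Representatives) and the MPR (Majelis
# Permusyawaratan Rakyat, People's Consultative Assembly).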
URL1 = 'http://dpr.go.id/id/anggota/'
URL2 = 'http://www.mpr.go.id/profil/anggota/?periode=1&N='
Unfetched_URLs = []  # URLs that failed to fetch, reported at the end of a run
def fetchWebpage(url):
    '''Helper function that takes a URL and returns a file-like
    object containing the html of the webpage'''
    return urllib2.urlopen(url)
def parseNames_DPR(f):
    '''Gets all the names from the URL1 site, and returns
    them as a list. The input parameter 'f' should be of
    type file.'''
    nameList = []
    soup = BeautifulSoup(f.read())
    # print soup.prettify() # pretty prints the html
    for line in soup('tr', align="center"):
        # append to the list of (name, id) tuples
        nameList.append((line('td')[1].text, line['id']))
    return nameList
def parseLinks_MPR(f):
    '''Gets all the profile links from the URL2 site, and returns
    them as a list. The input parameter 'f' should be of
    type file.'''
    linkList = []
    soup = BeautifulSoup(f.read())
    for element in soup('a', {'class': 'detail-view'}):
        linkList.append(element['href'])
    return linkList
def generateSubURLs_DPR(nameList):
    '''Take the list of (name, id) tuples and generate a list
    of member-page URLs that contain the actual data of
    interest'''
    subURLlist = []
    for (name, id_num) in nameList:
        tempName = name.lower()
        tempName = tempName.replace(' ', '-')
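        # e.g. (hypothetical values) the tuple ('Abdul Hakim', '500') would
        # yield http://dpr.go.id/id/anggota/2009/500/abdul-hakim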
        subURL = 'http://dpr.go.id/id/anggota/2009/{url_id}/{url_name}'.format(url_id=int(id_num), url_name=tempName)
        subURLlist.append(subURL)
    return subURLlist
def formatText(text):
    '''Helper function to normalize extracted text by stripping
    all punctuation characters and lowercasing the result'''
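    # e.g. formatText('Dr. H. Marzuki Alie, S.E.') -> 'dr h marzuki alie se'
    # (illustrative input: punctuation is removed, case is folded)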
    exclude = set(string.punctuation)
    text = ''.join(ch for ch in text if ch not in exclude)
    return text.lower()
def extractSubURLInfo_DPR(subURL):
    '''Function to extract a specific person's information
    given that person's sub-url on the DPR site'''
    try:
        f = fetchWebpage(subURL)
    except urllib2.URLError:  # HTTPError is a subclass of URLError
        print 'Could not fetch: {}'.format(subURL)
        Unfetched_URLs.append(subURL)
        return []
    soup = BeautifulSoup(f.read())
    # get the relevant table from the webpage
    dataTable = soup('table')[2]
    # extract the relevant data from the table
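    # NOTE: the table and cell indices below are hard-coded against the
    # dpr.go.id markup as of 2012 and will break if the page layout changes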
    fullName = dataTable('td')[2].text
    affiliation = dataTable('td')[3].text
    area_elected = dataTable('td')[4].text
    fraction = dataTable('td')[5].text
    # format the extracted data
    fullName = formatText(fullName)
    affiliation = formatText(affiliation)
    area_elected = formatText(area_elected)
    fraction = formatText(fraction)
    nameSplit = fullName.split(' ')
    familyName = nameSplit[-1]
    givenName = ''
    # find the first name, ignoring titles
    # (assuming titles are all 3 characters or less)
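    # e.g. the name 'dr h marzuki alie' (illustrative) gives givenName
    # 'marzuki' and familyName 'alie'; 'dr' and 'h' are skipped as titles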
    for namePart in nameSplit:
        if len(namePart) >= 4:
            givenName = namePart
            break
    return [givenName, familyName, area_elected, affiliation, fullName, fraction]
def extractSubURLInfo_MPR(subURL):
    '''Function to extract a specific person's information
    given that person's sub-url on the MPR site'''
    try:
        f = fetchWebpage(subURL)
    except urllib2.URLError:  # HTTPError is a subclass of URLError
        print 'Could not fetch: {}'.format(subURL)
        Unfetched_URLs.append(subURL)
        return []
    soup = BeautifulSoup(f.read())
    # get the relevant table from the webpage
    dataTable = soup('table')[0]
    # extract the relevant data from the table
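    # NOTE: as with the DPR scraper, these cell indices assume the
    # mpr.go.id profile layout as of 2012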
    fullName = dataTable('td')[7].text
    affiliation = dataTable('td')[19].text
    area_elected = dataTable('td')[16].text
    fraction = dataTable('td')[13].text
    birthinfo = dataTable('td')[10].text
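    # birthinfo is assumed to look like 'Place, 17 Agustus 1950' (an
    # illustrative value): birthplace before the first comma, the date
    # after the last one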
    birthplace = birthinfo.split(',')[0]
    birthdate = birthinfo.split(',')[-1].split()
    birthday, birthmonth, birthyear = birthdate[0], birthdate[1], birthdate[2]
    # format the extracted data
    fullName = formatText(fullName)
    affiliation = formatText(affiliation)
    area_elected = formatText(area_elected)
    fraction = formatText(fraction)
    birthplace = formatText(birthplace)
    nameSplit = fullName.split(' ')
    familyName = nameSplit[-1]
    givenName = ''
    # find the first name, ignoring titles
    # (assuming titles are all 3 characters or less)
    for namePart in nameSplit:
        if len(namePart) >= 4:
            givenName = namePart
            break
    return [givenName, familyName, area_elected, affiliation, fullName, fraction, birthplace, '{}-{}'.format(birthday, birthmonth), birthyear]
def listToExcel_DPR(values, filename):
    '''Function to take a 2-dimensional list of values and
    write it out as an Excel-readable CSV file'''
    with open('{}.csv'.format(filename), 'wb') as f:
        writer = csv.writer(f, dialect=csv.excel)
        writer.writerow(['Given Name', 'Family Name', 'Area Elected', 'Political Affiliation', 'Full Name', 'Fraction'])
        for line in values:
            writer.writerow(line)
def listToExcel_MPR(values, filename):
    '''Function to take a 2-dimensional list of values and
    write it out as an Excel-readable CSV file'''
    with open('{}.csv'.format(filename), 'wb') as f:
        writer = csv.writer(f, dialect=csv.excel)
        writer.writerow(['Given Name', 'Family Name', 'Area Elected', 'Political Affiliation', 'Full Name', 'Fraction', 'Place of Birth', 'Date of Birth', 'Year of Birth'])
        for line in values:
            writer.writerow(line)
if __name__ == "__main__":
    # generate DPR spreadsheet (disabled: remove the triple quotes to run it)
    '''
    f = fetchWebpage(URL1)
    print "webpage fetched"
    nameList = parseNames_DPR(f)
    print "namelist acquired"
    subURLlist = generateSubURLs_DPR(nameList)
    print "subURL list generated"
    infoList = []
    for subURL in subURLlist:
        print "getting data from: {}".format(subURL)
        infoList.append(extractSubURLInfo_DPR(subURL))
    print "infolist constructed"
    listToExcel_DPR(infoList, 'dpr')
    print "excel sheet written"
    print "Unfetched URLs:"
    for item in Unfetched_URLs:
        print '\t{}'.format(item)
    '''
    # generate MPR spreadsheet
    links = []
    for i in range(1, 36):
        temp_url = URL2 + str(i)
        f = fetchWebpage(temp_url)
        print "webpage {} fetched".format(i)
        tempLinksList = parseLinks_MPR(f)
        links.extend(tempLinksList)
    print "all links extracted"
    infoList = []
    for link in links:
        print "getting data from: {}".format(link)
        infoList.append(extractSubURLInfo_MPR(link))
    print "infolist constructed"
    listToExcel_MPR(infoList, 'mpr')
    print "excel sheet written"
    print "Unfetched URLs:"
    for item in Unfetched_URLs:
        print '\t{}'.format(item)