Example web scraping in Python
'''
webscrape.py - Python module to scrape data off of a website.
About: scrapes member information from the Indonesian DPR and MPR
parliament websites and writes the results out as CSV spreadsheets.
@author: Jason Dsouza
'''
__author__ = 'jasonrdsouza (Jason Dsouza)'

import urllib2
import string
import csv
from BeautifulSoup import BeautifulSoup # html parser
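
# NOTE: this module targets Python 2 and the legacy BeautifulSoup 3 parser
# (installable with `pip install BeautifulSoup`); a Python 3 port would use
# urllib.request and bs4 instead.
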
URL1 = 'http://dpr.go.id/id/anggota/'
URL2 = 'http://www.mpr.go.id/profil/anggota/?periode=1&N='
Unfetched_URLs = []

def fetchWebpage(url):
    '''Helper function that takes a URL and returns a file-like object
    whose contents are the html contents of the webpage'''
    return urllib2.urlopen(url)
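
# Usage sketch: the returned object behaves like a file, so the parsers
# below read the raw HTML with .read() (illustrative call):
#   page = fetchWebpage(URL1)
#   html = page.read()
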
def parseNames_DPR(f):
    '''Gets all the names from the URL1 site, and returns
    them as a list. The input parameter 'f' should be a
    file-like object.'''
    nameList = []
    soup = BeautifulSoup(f.read())
    # print soup.prettify() # pretty prints the html
    for line in soup('tr', align="center"):
        # append to the list of (name, id) tuples
        nameList.append((line('td')[1].text, line['id']))
    return nameList
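
# Illustrative return shape (name and id values are made up):
#   parseNames_DPR(f)  ->  [(u'Nama Anggota', u'101'), (u'Nama Lain', u'102')]
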
def parseLinks_MPR(f):
    '''Gets all the detail-page links from the URL2 site, and returns
    them as a list. The input parameter 'f' should be a
    file-like object.'''
    linkList = []
    soup = BeautifulSoup(f.read())
    for element in soup('a', {'class': 'detail-view'}):
        linkList.append(element['href'])
    return linkList
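
# Illustrative return shape (the hrefs are whatever the site's
# 'detail-view' anchors point at):
#   parseLinks_MPR(f)  ->  [u'http://www.mpr.go.id/profil/anggota/...', ...]
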
def generateSubURLs_DPR(nameList):
    '''Take the list of (name, id) tuples, and generate a list
    of sub-URLs that contain the actual data of
    interest'''
    subURLlist = []
    for (name, id_num) in nameList:
        tempName = name.lower().replace(' ', '-')
        subURL = 'http://dpr.go.id/id/anggota/2009/{url_id}/{url_name}'.format(url_id=int(id_num), url_name=tempName)
        subURLlist.append(subURL)
    return subURLlist
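
# Worked example (hypothetical name/id): lowercasing and hyphenating the
# name slots it into the sub-URL template:
#   generateSubURLs_DPR([('Abdul Hakim', '17')])
#   ->  ['http://dpr.go.id/id/anggota/2009/17/abdul-hakim']
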
def formatText(text):
    '''Helper function to format the result text. This involves:
    - removing all punctuation characters
    - lowercasing the text'''
    exclude = set(string.punctuation)
    text = ''.join(ch for ch in text if ch not in exclude)
    return text.lower()
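
# Worked example (hypothetical input): punctuation is dropped outright, so
# dotted titles collapse into short tokens that the name heuristic below skips:
#   formatText('Dr. Ir. Budi Santoso, M.Sc.')  ->  'dr ir budi santoso msc'
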
def extractSubURLInfo_DPR(subURL):
    '''Function to extract a specific person's information
    given that person's sub-url on the DPR site'''
    try:
        f = fetchWebpage(subURL)
    except (urllib2.HTTPError, urllib2.URLError):
        print 'Could not fetch: {}'.format(subURL)
        Unfetched_URLs.append(subURL)
        return []
    soup = BeautifulSoup(f.read())
    # get the relevant table from the webpage
    dataTable = soup('table')[2]
    # extract the relevant data from the table
    fullName = dataTable('td')[2].text
    affiliation = dataTable('td')[3].text
    area_elected = dataTable('td')[4].text
    fraction = dataTable('td')[5].text
    # format the extracted data
    fullName = formatText(fullName)
    affiliation = formatText(affiliation)
    area_elected = formatText(area_elected)
    fraction = formatText(fraction)
    nameSplit = fullName.split(' ')
    familyName = nameSplit[-1]
    givenName = ''
    # find the first name, ignoring titles
    # (assuming titles are all 3 characters or less)
    for part in nameSplit:
        if len(part) >= 4:
            givenName = part
            break
    return [givenName, familyName, area_elected, affiliation, fullName, fraction]
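
# The returned list lines up positionally with the header row written by
# listToExcel_DPR: given name, family name, area elected, affiliation,
# full name, fraction.
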
def extractSubURLInfo_MPR(subURL):
    '''Function to extract a specific person's information
    given that person's sub-url on the MPR site'''
    try:
        f = fetchWebpage(subURL)
    except (urllib2.HTTPError, urllib2.URLError):
        print 'Could not fetch: {}'.format(subURL)
        Unfetched_URLs.append(subURL)
        return []
    soup = BeautifulSoup(f.read())
    # get the relevant table from the webpage
    dataTable = soup('table')[0]
    # extract the relevant data from the table
    fullName = dataTable('td')[7].text
    affiliation = dataTable('td')[19].text
    area_elected = dataTable('td')[16].text
    fraction = dataTable('td')[13].text
    birthinfo = dataTable('td')[10].text
    # birthinfo looks like 'place, day month year'; the date is the text
    # after the last comma (a leading space precedes the day token)
    birthplace = birthinfo.split(',')[0]
    birthtemp = birthinfo.split(',')[-1]
    birthday = birthtemp.split(' ')[1]
    birthmonth = birthtemp.split(' ')[2]
    birthyear = birthtemp.split(' ')[3]
    # format the extracted data
    fullName = formatText(fullName)
    affiliation = formatText(affiliation)
    area_elected = formatText(area_elected)
    fraction = formatText(fraction)
    birthplace = formatText(birthplace)
    nameSplit = fullName.split(' ')
    familyName = nameSplit[-1]
    givenName = ''
    # find the first name, ignoring titles
    # (assuming titles are all 3 characters or less)
    for part in nameSplit:
        if len(part) >= 4:
            givenName = part
            break
    return [givenName, familyName, area_elected, affiliation, fullName, fraction, birthplace, '{}-{}'.format(birthday, birthmonth), birthyear]
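
# Same positional contract as the DPR version, matching the listToExcel_MPR
# header row, with the birth fields appended: birthplace, 'day-month', year.
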
def listToExcel_DPR(values, filename):
    '''Function to take a 2-dimensional list of values and
    turn it into a CSV file that Excel can open'''
    with open('{}.csv'.format(filename), 'wb') as f:
        writer = csv.writer(f, dialect=csv.excel)
        writer.writerow(['Given Name', 'Family Name', 'Area Elected', 'Political Affiliation', 'Full Name', 'Fraction'])
        for line in values:
            writer.writerow(line)

def listToExcel_MPR(values, filename):
    '''Function to take a 2-dimensional list of values and
    turn it into a CSV file that Excel can open'''
    with open('{}.csv'.format(filename), 'wb') as f:
        writer = csv.writer(f, dialect=csv.excel)
        writer.writerow(['Given Name', 'Family Name', 'Area Elected', 'Political Affiliation', 'Full Name', 'Fraction', 'Place of Birth', 'Date of Birth', 'Year of Birth'])
        for line in values:
            writer.writerow(line)
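
# Usage sketch (hypothetical row values): each inner list becomes one CSV
# row under the header:
#   listToExcel_MPR([['budi', 'santoso', 'jawa barat', 'partai x',
#                     'budi santoso', 'fraksi x', 'bandung',
#                     '17-agustus', '1945']], 'mpr')
#   # writes mpr.csv next to the script
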
if __name__ == "__main__":
    # generate DPR spreadsheet (disabled; remove the quotes to run)
    '''
    f = fetchWebpage(URL1)
    print "webpage fetched"
    nameList = parseNames_DPR(f)
    print "namelist acquired"
    subURLlist = generateSubURLs_DPR(nameList)
    print "subURL list generated"
    infoList = []
    for subURL in subURLlist:
        print "getting data from: {}".format(subURL)
        infoList.append(extractSubURLInfo_DPR(subURL))
    print "infolist constructed"
    listToExcel_DPR(infoList, 'dpr')
    print "excel sheet written"
    print "Unfetched URLs:"
    for item in Unfetched_URLs:
        print '\t{}'.format(item)
    '''
    # generate MPR spreadsheet
    links = []
    for i in range(1, 36):
        temp_url = URL2 + str(i)
        f = fetchWebpage(temp_url)
        print "webpage {} fetched".format(i)
        tempLinksList = parseLinks_MPR(f)
        links.extend(tempLinksList)
    print "all links extracted"
    infoList = []
    for link in links:
        print "getting data from: {}".format(link)
        infoList.append(extractSubURLInfo_MPR(link))
    print "infolist constructed"
    listToExcel_MPR(infoList, 'mpr')
    print "excel sheet written"
    print "Unfetched URLs:"
    for item in Unfetched_URLs:
        print '\t{}'.format(item)