Example web scraping in Python
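A note on assumptions: the script below is Python 2 code (urllib2, print statements) written against the BeautifulSoup 3 API; on a Python 2.7 setup, "pip install BeautifulSoup" should pull in the 3.x series it imports. The URL patterns and hard-coded table indices reflect the dpr.go.id and mpr.go.id page layouts as of 2012, so expect to re-derive them if those sites have changed.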
'''
webscrape.py - Python module to allow scraping data off of a website.
About: scrapes member profiles (names, party affiliation, electoral
area, and birth data) from the Indonesian DPR and MPR parliament
websites and writes them out as CSV spreadsheets.
@author: Jason Dsouza
'''
__author__ = 'jasonrdsouza (Jason Dsouza)'
import urllib2
import string
import csv
from BeautifulSoup import BeautifulSoup  # html parser
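# Member listings for Indonesia's two national legislative bodies: the DPR
# (Dewan Perwakilan Rakyat, House of Representatives) and the MPR (Majelis
# Permusyawaratan Rakyat, People's Consultative Assembly).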
URL1 = 'http://dpr.go.id/id/anggota/'
URL2 = 'http://www.mpr.go.id/profil/anggota/?periode=1&N='
Unfetched_URLs = []  # URLs that failed to fetch, reported at the end of a run
def fetchWebpage(url):
    '''Helper function that takes a URL and returns a file-like
    object containing the html of the webpage'''
    return urllib2.urlopen(url)
def parseNames_DPR(f):
    '''Gets all the names from the URL1 site, and returns
    them as a list. The input parameter 'f' should be of
    type file.'''
    nameList = []
    soup = BeautifulSoup(f.read())
    # print soup.prettify() # pretty prints the html
    for line in soup('tr', align="center"):
        # append to the list of (name, id) tuples
        nameList.append((line('td')[1].text, line['id']))
    return nameList
def parseLinks_MPR(f):
    '''Gets all the profile links from the URL2 site, and returns
    them as a list. The input parameter 'f' should be of
    type file.'''
    linkList = []
    soup = BeautifulSoup(f.read())
    for element in soup('a', {'class': 'detail-view'}):
        linkList.append(element['href'])
    return linkList
def generateSubURLs_DPR(nameList):
    '''Take the list of (name, id) tuples and generate a list
    of member-page URLs that contain the actual data of
    interest'''
    subURLlist = []
    for (name, id_num) in nameList:
        tempName = name.lower()
        tempName = tempName.replace(' ', '-')
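        # e.g. (hypothetical values) the tuple ('Abdul Hakim', '500') would
        # yield http://dpr.go.id/id/anggota/2009/500/abdul-hakim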
        subURL = 'http://dpr.go.id/id/anggota/2009/{url_id}/{url_name}'.format(url_id=int(id_num), url_name=tempName)
        subURLlist.append(subURL)
    return subURLlist
def formatText(text):
    '''Helper function to normalize extracted text by stripping
    all punctuation characters and lowercasing the result'''
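    # e.g. formatText('Dr. H. Marzuki Alie, S.E.') -> 'dr h marzuki alie se'
    # (illustrative input: punctuation is removed, case is folded)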
    exclude = set(string.punctuation)
    text = ''.join(ch for ch in text if ch not in exclude)
    return text.lower()
def extractSubURLInfo_DPR(subURL):
    '''Function to extract a specific person's information
    given that person's sub-url on the DPR site'''
    try:
        f = fetchWebpage(subURL)
    except urllib2.URLError:  # HTTPError is a subclass of URLError
        print 'Could not fetch: {}'.format(subURL)
        Unfetched_URLs.append(subURL)
        return []
    soup = BeautifulSoup(f.read())
    # get the relevant table from the webpage
    dataTable = soup('table')[2]
    # extract the relevant data from the table
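    # NOTE: the table and cell indices below are hard-coded against the
    # dpr.go.id markup as of 2012 and will break if the page layout changes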
    fullName = dataTable('td')[2].text
    affiliation = dataTable('td')[3].text
    area_elected = dataTable('td')[4].text
    fraction = dataTable('td')[5].text
    # format the extracted data
    fullName = formatText(fullName)
    affiliation = formatText(affiliation)
    area_elected = formatText(area_elected)
    fraction = formatText(fraction)
    nameSplit = fullName.split(' ')
    familyName = nameSplit[-1]
    givenName = ''
    # find the first name, ignoring titles
    # (assuming titles are all 3 characters or less)
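    # e.g. the name 'dr h marzuki alie' (illustrative) gives givenName
    # 'marzuki' and familyName 'alie'; 'dr' and 'h' are skipped as titles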
    for namePart in nameSplit:
        if len(namePart) >= 4:
            givenName = namePart
            break
    return [givenName, familyName, area_elected, affiliation, fullName, fraction]
def extractSubURLInfo_MPR(subURL):
    '''Function to extract a specific person's information
    given that person's sub-url on the MPR site'''
    try:
        f = fetchWebpage(subURL)
    except urllib2.URLError:  # HTTPError is a subclass of URLError
        print 'Could not fetch: {}'.format(subURL)
        Unfetched_URLs.append(subURL)
        return []
    soup = BeautifulSoup(f.read())
    # get the relevant table from the webpage
    dataTable = soup('table')[0]
    # extract the relevant data from the table
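    # NOTE: as with the DPR scraper, these cell indices assume the
    # mpr.go.id profile layout as of 2012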
    fullName = dataTable('td')[7].text
    affiliation = dataTable('td')[19].text
    area_elected = dataTable('td')[16].text
    fraction = dataTable('td')[13].text
    birthinfo = dataTable('td')[10].text
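    # birthinfo is assumed to look like 'Place, 17 Agustus 1950' (an
    # illustrative value): birthplace before the first comma, the date
    # after the last one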
    birthplace = birthinfo.split(',')[0]
    birthdate = birthinfo.split(',')[-1].split()
    birthday, birthmonth, birthyear = birthdate[0], birthdate[1], birthdate[2]
    # format the extracted data
    fullName = formatText(fullName)
    affiliation = formatText(affiliation)
    area_elected = formatText(area_elected)
    fraction = formatText(fraction)
    birthplace = formatText(birthplace)
    nameSplit = fullName.split(' ')
    familyName = nameSplit[-1]
    givenName = ''
    # find the first name, ignoring titles
    # (assuming titles are all 3 characters or less)
    for namePart in nameSplit:
        if len(namePart) >= 4:
            givenName = namePart
            break
    return [givenName, familyName, area_elected, affiliation, fullName, fraction, birthplace, '{}-{}'.format(birthday, birthmonth), birthyear]
def listToExcel_DPR(values, filename):
    '''Function to take a 2-dimensional list of values and
    write it out as an Excel-readable CSV file'''
    with open('{}.csv'.format(filename), 'wb') as f:
        writer = csv.writer(f, dialect=csv.excel)
        writer.writerow(['Given Name', 'Family Name', 'Area Elected', 'Political Affiliation', 'Full Name', 'Fraction'])
        for line in values:
            writer.writerow(line)
def listToExcel_MPR(values, filename):
    '''Function to take a 2-dimensional list of values and
    write it out as an Excel-readable CSV file'''
    with open('{}.csv'.format(filename), 'wb') as f:
        writer = csv.writer(f, dialect=csv.excel)
        writer.writerow(['Given Name', 'Family Name', 'Area Elected', 'Political Affiliation', 'Full Name', 'Fraction', 'Place of Birth', 'Date of Birth', 'Year of Birth'])
        for line in values:
            writer.writerow(line)
if __name__ == "__main__":
    # generate DPR spreadsheet (disabled: remove the triple quotes to run it)
    '''
    f = fetchWebpage(URL1)
    print "webpage fetched"
    nameList = parseNames_DPR(f)
    print "namelist acquired"
    subURLlist = generateSubURLs_DPR(nameList)
    print "subURL list generated"
    infoList = []
    for subURL in subURLlist:
        print "getting data from: {}".format(subURL)
        infoList.append(extractSubURLInfo_DPR(subURL))
    print "infolist constructed"
    listToExcel_DPR(infoList, 'dpr')
    print "excel sheet written"
    print "Unfetched URLs:"
    for item in Unfetched_URLs:
        print '\t{}'.format(item)
    '''
    # generate MPR spreadsheet
    links = []
    for i in range(1, 36):
        temp_url = URL2 + str(i)
        f = fetchWebpage(temp_url)
        print "webpage {} fetched".format(i)
        tempLinksList = parseLinks_MPR(f)
        links.extend(tempLinksList)
    print "all links extracted"
    infoList = []
    for link in links:
        print "getting data from: {}".format(link)
        infoList.append(extractSubURLInfo_MPR(link))
    print "infolist constructed"
    listToExcel_MPR(infoList, 'mpr')
    print "excel sheet written"
    print "Unfetched URLs:"
    for item in Unfetched_URLs:
        print '\t{}'.format(item)