Skip to content

Instantly share code, notes, and snippets.

@msanatan
Last active August 29, 2015 14:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save msanatan/f42ab4e1a3f63ae65138 to your computer and use it in GitHub Desktop.
Save msanatan/f42ab4e1a3f63ae65138 to your computer and use it in GitHub Desktop.
Scraper for ttconnect Government Ministry page
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''A web scraper for the Government of Trinidad and Tobago's Ministry data. The
main link: http://www.ttconnect.gov.tt/'''
from lxml import html
import requests
import json
from collections import OrderedDict
from argparse import ArgumentParser
BASE_URL = 'http://www.ttconnect.gov.tt'
URL = ('http://www.ttconnect.gov.tt/gortt/portal/ttconnect/SharedDetail/'
'?WCM_GLOBAL_CONTEXT=/gortt/wcm/connect/gortt+web+content/TTConnect'
'/Home/Government+Ministries')
HEADERS = {
'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3)'
'AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/35.0.1916.47 Safari/537.36')
}
class Ministry(object):
    """Store all data about a Government Ministry available on ttconnect."""

    def __init__(self, name, address='', telephone=None, fax=None, website=''):
        """Capture all data about the ministry.

        Args:
            name (str): The Ministry's name. Mandatory field.
            address (str): The Ministry's address.
            telephone (list of str): The Ministry's phone numbers. Up to
                three telephone numbers are exported by to_csv().
            fax (list of str): The Ministry's fax numbers. Up to three fax
                numbers are exported by to_csv().
            website (str): The Ministry's website link.
        """
        self.name = name
        self.address = address
        # None sentinels keep the default lists from being shared between
        # instances (mutable-default pitfall).
        self.telephone = [] if telephone is None else telephone
        self.fax = [] if fax is None else fax
        self.website = website

    def __repr__(self):
        """Render Ministry object data as a human-readable multi-line string."""
        parts = ['Ministry: {0}\nAddress: {1}'.format(self.name, self.address)]
        parts.append('\nTelephone numbers:')
        parts.extend('\n{0}'.format(num) for num in self.telephone)
        parts.append('\nFax numbers:')
        parts.extend('\n{0}'.format(num) for num in self.fax)
        parts.append('\nWebsite: {0}\n'.format(self.website))
        return ''.join(parts)

    def to_csv(self, delimiter=','):
        """Export Ministry data to a single CSV row.

        The row is: quoted name, quoted address, exactly three telephone
        fields and three fax fields (blank-padded or truncated), then the
        website. Only name and address are quoted, matching the original
        output format.

        Args:
            delimiter (str): Field separator; defaults to a comma.

        Returns:
            str: The delimiter-separated row with no trailing newline.
        """
        # Pad/truncate to exactly three entries each. The original used a
        # Python-2-only xrange loop with per-index IndexError handling and
        # quadratic string concatenation.
        tels = (list(self.telephone) + ['', '', ''])[:3]
        faxes = (list(self.fax) + ['', '', ''])[:3]
        fields = ['"{0}"'.format(self.name), '"{0}"'.format(self.address)]
        fields.extend(tels)
        fields.extend(faxes)
        fields.append(self.website)
        return delimiter.join(fields)

    def to_json(self):
        """Export Ministry data to an indented JSON string.

        Returns:
            str: JSON object with keys in insertion order (name, address,
            telephone, fax, website).
        """
        result = OrderedDict([
            ('name', self.name),
            ('address', self.address),
            ('telephone', self.telephone),
            ('fax', self.fax),
            ('website', self.website),
        ])
        return json.dumps(result, indent=4, separators=(',', ': '))
def parse_tel_nums(line, keyword):
    """Retrieve telephone or fax numbers from a line of scraped text.

    Args:
        line (str): Text such as 'Telephone: 123-4567, 234-5678'.
        keyword (str): The label preceding the numbers, e.g. 'Telephone:'.

    Returns:
        list of str: The whitespace-stripped numbers found after *keyword*.
    """
    # str.split already returns a list; the original wrapped it in a
    # redundant identity list comprehension.
    numbers = line.split(keyword)[1].strip()
    return [num.strip() for num in numbers.split(',')]
def scrape_ministry(url):
    """Scrape one ministry detail page and return a Ministry object.

    Args:
        url (str): Absolute URL of the ministry's detail page.

    Returns:
        Ministry: populated from the page's main content div.

    Raises:
        requests.exceptions.RequestException: re-raised when the site is
            unreachable, after printing a short notice.
    """
    try:
        page = requests.get(url, headers=HEADERS)
    except requests.exceptions.RequestException:
        # print() works on both Python 2 and 3; the original bare print
        # statement is a syntax error under Python 3.
        print('Could not reach the website')
        raise
    else:
        tree = html.fromstring(page.text)
        content = tree.xpath('//div[@class="portletMainContent"]')[0]
        name = content.xpath('./h3/text()')[0].strip()
        # Collapse hard line breaks inside each paragraph's text.
        remaining_text = [x.strip().replace('\r\n', ' ')
                          for x in content.xpath('.//p/text()')]
        address = remaining_text[0].split('Address:')[1].strip()
        telephone = parse_tel_nums(remaining_text[1], 'Telephone:')
        fax = parse_tel_nums(remaining_text[2], 'Fax:')
        try:
            website = content.xpath('p/a/text()')[0].strip()
        except IndexError:
            # Not every ministry page lists a website link.
            website = ''
        return Ministry(name, address, telephone, fax, website)
def scrape(url):
    """Return a list of Ministry objects for every ministry linked on *url*.

    Args:
        url (str): The ministry index page to crawl.

    Returns:
        list of Ministry: one entry per link found in the page's list items.

    Raises:
        requests.exceptions.RequestException: re-raised when the site is
            unreachable, after printing a short notice.
    """
    try:
        page = requests.get(url, headers=HEADERS)
    except requests.exceptions.RequestException:
        # print() works on both Python 2 and 3; the original bare print
        # statement is a syntax error under Python 3.
        print('Could not reach the website')
        raise
    else:
        tree = html.fromstring(page.text)
        content = tree.xpath('//div[@class="portletMainContent"]')[0]
        ministries = []
        for link in content.xpath('.//li/a[1]/@href'):
            # Relative hrefs may contain spaces and stray newlines; make a
            # fetchable absolute URL before following it.
            min_url = BASE_URL + link.strip().replace(' ', '%20').replace('\n', '')
            ministries.append(scrape_ministry(min_url))
        return ministries
def pprint_ministries(ministries, style=None):
    """Format the Ministry data for output.

    Args:
        ministries (list of Ministry): objects to render.
        style (str): 'csv', 'json', or None for the plain repr format.

    Returns:
        str: the rendered data. An empty list yields '' (or '[]' for JSON;
        the original indexed ministries[-1] and raised IndexError when the
        list was empty). Also replaces the Python-2-only xrange loop and
        quadratic string concatenation.
    """
    if style == 'csv':
        return ''.join(ministry.to_csv() + '\n' for ministry in ministries)
    if style == 'json':
        # join inserts the commas between objects and naturally handles
        # the zero- and one-element cases.
        return '[' + ','.join(ministry.to_json() for ministry in ministries) + ']'
    return ''.join(str(ministry) + '\n' for ministry in ministries)
def main():
    """Parse command-line arguments, scrape the site, and emit the data.

    Flags:
        -f/--file FILE: write output to FILE; when no format flag is given
            the format is inferred from a .csv or .json extension.
        -c/--csv, -j/--json: mutually exclusive output formats; default is
            the plain repr format.
    """
    parser = ArgumentParser(description='Gather all Trinidad and Tobago '
                            'Government Ministries and their contact data')
    parser.add_argument('-f', '--file', dest='filename', metavar='File',
                        help='Write output to file')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('-c', '--csv', action='store_true',
                       help='Present data in CSV format')
    group.add_argument('-j', '--json', action='store_true',
                       help='Present data in JSON format')
    args = parser.parse_args()
    ministries = scrape(URL)
    # args.filename defaults to None; the original sliced it unguarded
    # (args.filename[-4:]) and raised TypeError whenever -f was omitted
    # without -c/-j. Guard the extension checks and use endswith().
    if args.csv or (args.filename and args.filename.endswith('.csv')):
        ministry_data = pprint_ministries(ministries, 'csv')
    elif args.json or (args.filename and args.filename.endswith('.json')):
        ministry_data = pprint_ministries(ministries, 'json')
    else:
        ministry_data = pprint_ministries(ministries)
    if args.filename:
        with open(args.filename, 'w') as min_file:
            min_file.write(ministry_data)
    else:
        # print() works on both Python 2 and 3.
        print(ministry_data)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment