Skip to content

Instantly share code, notes, and snippets.

@msanatan
Last active August 29, 2015 14:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save msanatan/f42ab4e1a3f63ae65138 to your computer and use it in GitHub Desktop.
Save msanatan/f42ab4e1a3f63ae65138 to your computer and use it in GitHub Desktop.
Scraper for ttconnect Government Ministry page
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''A web scraper for the Government of Trinidad and Tobago's Ministry data. The
main link: http://www.ttconnect.gov.tt/'''
from lxml import html
import requests
import json
from collections import OrderedDict
from argparse import ArgumentParser
BASE_URL = 'http://www.ttconnect.gov.tt'
URL = ('http://www.ttconnect.gov.tt/gortt/portal/ttconnect/SharedDetail/'
'?WCM_GLOBAL_CONTEXT=/gortt/wcm/connect/gortt+web+content/TTConnect'
'/Home/Government+Ministries')
HEADERS = {
'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3)'
'AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/35.0.1916.47 Safari/537.36')
}
class Ministry(object):
    """Store all data about a Government Ministry available on ttconnect."""

    def __init__(self, name, address='', telephone=None, fax=None, website=''):
        """Capture all data about the ministry.

        Args:
            name (str): The Ministry's name. Mandatory field.
            address (str): The Ministry's address.
            telephone (list of str): The Ministry's phone numbers. Up to
                three telephone numbers are exported by to_csv().
            fax (list of str): The Ministry's fax numbers. Up to three fax
                numbers are exported by to_csv().
            website (str): The Ministry's website link.
        """
        self.name = name
        self.address = address
        # None sentinels keep the default lists from being shared between
        # instances (mutable-default pitfall).
        self.telephone = [] if telephone is None else telephone
        self.fax = [] if fax is None else fax
        self.website = website

    def __repr__(self):
        """Render Ministry object data as a human-readable multi-line string."""
        parts = ['Ministry: {0}\nAddress: {1}'.format(self.name, self.address)]
        parts.append('\nTelephone numbers:')
        parts.extend('\n{0}'.format(num) for num in self.telephone)
        parts.append('\nFax numbers:')
        parts.extend('\n{0}'.format(num) for num in self.fax)
        parts.append('\nWebsite: {0}\n'.format(self.website))
        return ''.join(parts)

    def to_csv(self, delimiter=','):
        """Export Ministry data to a single CSV row.

        The row is: quoted name, quoted address, exactly three telephone
        fields and three fax fields (blank-padded or truncated), then the
        website. Only name and address are quoted, matching the original
        output format.

        Args:
            delimiter (str): Field separator; defaults to a comma.

        Returns:
            str: The delimiter-separated row with no trailing newline.
        """
        # Pad/truncate to exactly three entries each. The original used a
        # Python-2-only xrange loop with per-index IndexError handling and
        # quadratic string concatenation.
        tels = (list(self.telephone) + ['', '', ''])[:3]
        faxes = (list(self.fax) + ['', '', ''])[:3]
        fields = ['"{0}"'.format(self.name), '"{0}"'.format(self.address)]
        fields.extend(tels)
        fields.extend(faxes)
        fields.append(self.website)
        return delimiter.join(fields)

    def to_json(self):
        """Export Ministry data to an indented JSON string.

        Returns:
            str: JSON object with keys in insertion order (name, address,
            telephone, fax, website).
        """
        result = OrderedDict([
            ('name', self.name),
            ('address', self.address),
            ('telephone', self.telephone),
            ('fax', self.fax),
            ('website', self.website),
        ])
        return json.dumps(result, indent=4, separators=(',', ': '))
def parse_tel_nums(line, keyword):
    """Retrieve telephone or fax numbers from a line of scraped text.

    Args:
        line (str): Text such as 'Telephone: 123-4567, 234-5678'.
        keyword (str): The label preceding the numbers, e.g. 'Telephone:'.

    Returns:
        list of str: The whitespace-stripped numbers found after *keyword*.
    """
    # str.split already returns a list; the original wrapped it in a
    # redundant identity list comprehension.
    numbers = line.split(keyword)[1].strip()
    return [num.strip() for num in numbers.split(',')]
def scrape_ministry(url):
    """Scrape one ministry detail page and return a Ministry object.

    Args:
        url (str): Absolute URL of the ministry's detail page.

    Returns:
        Ministry: populated from the page's main content div.

    Raises:
        requests.exceptions.RequestException: re-raised when the site is
            unreachable, after printing a short notice.
    """
    try:
        page = requests.get(url, headers=HEADERS)
    except requests.exceptions.RequestException:
        # print() works on both Python 2 and 3; the original bare print
        # statement is a syntax error under Python 3.
        print('Could not reach the website')
        raise
    else:
        tree = html.fromstring(page.text)
        content = tree.xpath('//div[@class="portletMainContent"]')[0]
        name = content.xpath('./h3/text()')[0].strip()
        # Collapse hard line breaks inside each paragraph's text.
        remaining_text = [x.strip().replace('\r\n', ' ')
                          for x in content.xpath('.//p/text()')]
        address = remaining_text[0].split('Address:')[1].strip()
        telephone = parse_tel_nums(remaining_text[1], 'Telephone:')
        fax = parse_tel_nums(remaining_text[2], 'Fax:')
        try:
            website = content.xpath('p/a/text()')[0].strip()
        except IndexError:
            # Not every ministry page lists a website link.
            website = ''
        return Ministry(name, address, telephone, fax, website)
def scrape(url):
    """Return a list of Ministry objects for every ministry linked on *url*.

    Args:
        url (str): The ministry index page to crawl.

    Returns:
        list of Ministry: one entry per link found in the page's list items.

    Raises:
        requests.exceptions.RequestException: re-raised when the site is
            unreachable, after printing a short notice.
    """
    try:
        page = requests.get(url, headers=HEADERS)
    except requests.exceptions.RequestException:
        # print() works on both Python 2 and 3; the original bare print
        # statement is a syntax error under Python 3.
        print('Could not reach the website')
        raise
    else:
        tree = html.fromstring(page.text)
        content = tree.xpath('//div[@class="portletMainContent"]')[0]
        ministries = []
        for link in content.xpath('.//li/a[1]/@href'):
            # Relative hrefs may contain spaces and stray newlines; make a
            # fetchable absolute URL before following it.
            min_url = BASE_URL + link.strip().replace(' ', '%20').replace('\n', '')
            ministries.append(scrape_ministry(min_url))
        return ministries
def pprint_ministries(ministries, style=None):
    """Format the Ministry data for output.

    Args:
        ministries (list of Ministry): objects to render.
        style (str): 'csv', 'json', or None for the plain repr format.

    Returns:
        str: the rendered data. An empty list yields '' (or '[]' for JSON;
        the original indexed ministries[-1] and raised IndexError when the
        list was empty). Also replaces the Python-2-only xrange loop and
        quadratic string concatenation.
    """
    if style == 'csv':
        return ''.join(ministry.to_csv() + '\n' for ministry in ministries)
    if style == 'json':
        # join inserts the commas between objects and naturally handles
        # the zero- and one-element cases.
        return '[' + ','.join(ministry.to_json() for ministry in ministries) + ']'
    return ''.join(str(ministry) + '\n' for ministry in ministries)
def main():
    """Parse command-line arguments, scrape the site, and emit the data.

    Flags:
        -f/--file FILE: write output to FILE; when no format flag is given
            the format is inferred from a .csv or .json extension.
        -c/--csv, -j/--json: mutually exclusive output formats; default is
            the plain repr format.
    """
    parser = ArgumentParser(description='Gather all Trinidad and Tobago '
                            'Government Ministries and their contact data')
    parser.add_argument('-f', '--file', dest='filename', metavar='File',
                        help='Write output to file')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('-c', '--csv', action='store_true',
                       help='Present data in CSV format')
    group.add_argument('-j', '--json', action='store_true',
                       help='Present data in JSON format')
    args = parser.parse_args()
    ministries = scrape(URL)
    # args.filename defaults to None; the original sliced it unguarded
    # (args.filename[-4:]) and raised TypeError whenever -f was omitted
    # without -c/-j. Guard the extension checks and use endswith().
    if args.csv or (args.filename and args.filename.endswith('.csv')):
        ministry_data = pprint_ministries(ministries, 'csv')
    elif args.json or (args.filename and args.filename.endswith('.json')):
        ministry_data = pprint_ministries(ministries, 'json')
    else:
        ministry_data = pprint_ministries(ministries)
    if args.filename:
        with open(args.filename, 'w') as min_file:
            min_file.write(ministry_data)
    else:
        # print() works on both Python 2 and 3.
        print(ministry_data)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment