Last active
August 29, 2015 14:15
-
-
Save msanatan/f42ab4e1a3f63ae65138 to your computer and use it in GitHub Desktop.
Scraper for ttconnect Government Ministry page
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
'''A web scraper for the Government of Trinidad and Tobago's Ministry data.

Main site: http://www.ttconnect.gov.tt/'''
from lxml import html | |
import requests | |
import json | |
from collections import OrderedDict | |
from argparse import ArgumentParser | |
# Root of the ttconnect portal; the ministry links found on the index page are
# appended to this to form absolute URLs.
BASE_URL = 'http://www.ttconnect.gov.tt'

# Index page that lists the Government Ministries.
URL = ('http://www.ttconnect.gov.tt/gortt/portal/ttconnect/SharedDetail/'
       '?WCM_GLOBAL_CONTEXT=/gortt/wcm/connect/gortt+web+content/TTConnect'
       '/Home/Government+Ministries')

# Request headers sent with every page fetch. Adjacent string literals are
# concatenated with no separator, so each fragment except the last must end
# with its own trailing space.
HEADERS = {
    'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/35.0.1916.47 Safari/537.36'),
}
class Ministry(object):
    '''Contact details of one Government Ministry available on ttconnect.

    Instances are plain data holders that can render themselves as
    human-readable text, a CSV row or a JSON object.
    '''

    # Number of telephone columns, and of fax columns, in a CSV row.
    _CSV_NUMBERS = 3

    def __init__(self, name, address='', telephone=None, fax=None, website=''):
        '''Capture all data about the ministry.

        Args:
            name (str): The Ministry's name. Mandatory field.
            address (str): The Ministry's address.
            telephone (list of str): The Ministry's phone numbers. All of
                them are kept; only to_csv() limits itself to the first
                three. Defaults to a new empty list.
            fax (list of str): The Ministry's fax numbers, handled the same
                way as telephone.
            website (str): The Ministry's website link.
        '''
        self.name = name
        self.address = address
        # None is the "not given" marker so that every instance gets its own
        # fresh list instead of sharing one mutable default.
        self.telephone = [] if telephone is None else telephone
        self.fax = [] if fax is None else fax
        self.website = website

    def __repr__(self):
        '''Render Ministry object data as multi-line, human-readable text.'''
        lines = [
            'Ministry: {0}'.format(self.name),
            'Address: {0}'.format(self.address),
            'Telephone numbers:',
        ]
        lines.extend('{0}'.format(num) for num in self.telephone)
        lines.append('Fax numbers:')
        lines.extend('{0}'.format(num) for num in self.fax)
        # The rendering ends with a newline after the website line.
        lines.append('Website: {0}\n'.format(self.website))
        return '\n'.join(lines)

    def to_csv(self, delimiter=','):
        '''Export Ministry data as one CSV row (without a line terminator).

        The row always has nine fields: name, address, three telephone
        numbers, three fax numbers and the website. Missing numbers become
        empty fields and numbers beyond the third are left out.

        Name and address are always wrapped in double quotes. The other
        fields are quoted only when they contain the delimiter, a double
        quote or a line break. Embedded double quotes are doubled.

        Args:
            delimiter (str): Field separator.

        Returns:
            str: The CSV row.
        '''
        def quoted(value):
            # Double embedded quotes so the field cannot end early.
            return '"{0}"'.format('{0}'.format(value).replace('"', '""'))

        def minimal(value):
            # Leave ordinary values untouched; quote only when the raw text
            # would otherwise break the row apart.
            text = '{0}'.format(value)
            unsafe = ((delimiter and delimiter in text)
                      or any(ch in text for ch in '"\r\n'))
            return quoted(text) if unsafe else text

        def padded(numbers):
            # Exactly _CSV_NUMBERS entries: truncate, then pad with ''.
            picked = list(numbers[:self._CSV_NUMBERS])
            return picked + [''] * (self._CSV_NUMBERS - len(picked))

        fields = [quoted(self.name), quoted(self.address)]
        fields.extend(minimal(num) for num in padded(self.telephone))
        fields.extend(minimal(num) for num in padded(self.fax))
        fields.append(minimal(self.website))
        return delimiter.join(fields)

    def to_json(self):
        '''Export Ministry data to JSON format.

        Returns:
            str: A JSON object indented by four spaces, with the keys in the
            order name, address, telephone, fax, website.
        '''
        # OrderedDict keeps the key order stable in the serialized text.
        result = OrderedDict([
            ('name', self.name),
            ('address', self.address),
            ('telephone', self.telephone),
            ('fax', self.fax),
            ('website', self.website),
        ])
        return json.dumps(result, indent=4, separators=(',', ': '))
def parse_tel_nums(line, keyword):
    '''Retrieve either telephone or fax numbers from a line.

    Args:
        line (str): Text holding a label followed by comma-separated
            numbers, e.g. 'Telephone: 623-1234, 623-5678'.
        keyword (str): The label that precedes the numbers, e.g.
            'Telephone:'.

    Returns:
        list of str: The entries that follow the first occurrence of
        keyword, each stripped of surrounding whitespace. Blank entries are
        dropped. The list is empty when keyword does not occur in line or
        when nothing follows it.
    '''
    pieces = line.split(keyword)
    if len(pieces) < 2:
        # The label is missing: report "no numbers" instead of failing.
        return []
    candidates = (num.strip() for num in pieces[1].split(','))
    return [num for num in candidates if num]
def _find_field(paragraphs, position, keyword):
    '''Return the paragraph text that carries keyword, or None.

    The text at the expected position is preferred; the remaining texts are
    searched only when that one does not contain keyword.
    '''
    if position < len(paragraphs) and keyword in paragraphs[position]:
        return paragraphs[position]
    for text in paragraphs:
        if keyword in text:
            return text
    return None


def scrape_ministry(url, timeout=30):
    '''Given a url, scrape the website and return a Ministry object.

    Args:
        url (str): Address of a single ministry's detail page.
        timeout (float): Seconds to wait for the server before giving up, so
            an unresponsive server cannot block the program forever.

    Returns:
        Ministry: Name, address, telephone numbers, fax numbers and website
        read from the page. A field that cannot be found on the page is left
        empty ('' or []).

    Raises:
        requests.exceptions.RequestException: The page could not be fetched
            (connection problem, timeout or HTTP error status).
        IndexError: The page has no 'portletMainContent' div or that div has
            no h3 heading.
    '''
    try:
        page = requests.get(url, headers=HEADERS, timeout=timeout)
        # Treat 4xx/5xx answers as failures instead of parsing an error page.
        page.raise_for_status()
    except requests.exceptions.RequestException:
        print('Could not reach the website')
        raise

    tree = html.fromstring(page.text)
    content = tree.xpath('//div[@class="portletMainContent"]')[0]
    name = content.xpath('./h3/text()')[0].strip()
    paragraphs = [text.strip().replace('\r\n', ' ')
                  for text in content.xpath('.//p/text()')]

    # Expected order of the paragraph texts: address, telephone, fax.
    address_text = _find_field(paragraphs, 0, 'Address:')
    telephone_text = _find_field(paragraphs, 1, 'Telephone:')
    fax_text = _find_field(paragraphs, 2, 'Fax:')

    address = ''
    if address_text is not None:
        address = address_text.split('Address:')[1].strip()
    telephone = []
    if telephone_text is not None:
        telephone = parse_tel_nums(telephone_text, 'Telephone:')
    fax = []
    if fax_text is not None:
        fax = parse_tel_nums(fax_text, 'Fax:')

    # The website link is optional on the page.
    links = content.xpath('p/a/text()')
    website = links[0].strip() if links else ''

    return Ministry(name, address, telephone, fax, website)
def scrape(url, timeout=30):
    '''Return a list of Ministry objects available in the website.

    Args:
        url (str): Address of the index page that links to every ministry.
        timeout (float): Seconds to wait for the index page before giving
            up, so an unresponsive server cannot block the program forever.

    Returns:
        list of Ministry: One entry per ministry link, in page order.

    Raises:
        requests.exceptions.RequestException: A page could not be fetched
            (connection problem, timeout or HTTP error status).
        IndexError: The index page has no 'portletMainContent' div.
    '''
    try:
        page = requests.get(url, headers=HEADERS, timeout=timeout)
        # Treat 4xx/5xx answers as failures instead of parsing an error page.
        page.raise_for_status()
    except requests.exceptions.RequestException:
        print('Could not reach the website')
        raise

    tree = html.fromstring(page.text)
    content = tree.xpath('//div[@class="portletMainContent"]')[0]
    ministries = []
    # Only the first anchor of each list item is followed.
    for link in content.xpath('.//li/a[1]/@href'):
        # Clean the href before joining: encode spaces, drop line breaks.
        cleaned = link.strip().replace(' ', '%20').replace('\n', '')
        ministries.append(scrape_ministry(BASE_URL + cleaned))
    return ministries
def pprint_ministries(ministries, style=None):
    '''Format the output of the Ministry data appropriately.

    Args:
        ministries (iterable): Objects providing to_csv(), to_json() and
            str() renderings.
        style (str): 'csv' for one CSV row per line, 'json' for a JSON
            array; any other value gives the plain-text rendering.

    Returns:
        str: The formatted data. An empty input gives '[]' for the 'json'
        style and '' for the other styles.
    '''
    if style == 'csv':
        return ''.join(ministry.to_csv() + '\n' for ministry in ministries)
    if style == 'json':
        # Joining never touches a "last" element, so an empty input is safe.
        entries = ','.join(ministry.to_json() for ministry in ministries)
        return '[' + entries + ']'
    return ''.join(str(ministry) + '\n' for ministry in ministries)
def main():
    '''Parse the command line, scrape the ministries and output the result.

    The output format is chosen in this order: an explicit --csv or --json
    flag, then the extension of the --file name ('.csv' or '.json'), then
    plain text. The result is written to the given file, or printed when no
    file is given.
    '''
    parser = ArgumentParser(description='Gather all Trinidad and Tobago '
                            'Government Ministries and their contact data')
    parser.add_argument('-f', '--file', dest='filename', metavar='File',
                        help='Write output to file')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('-c', '--csv', action='store_true',
                       help='Present data in CSV format')
    group.add_argument('-j', '--json', action='store_true',
                       help='Present data in JSON format')
    args = parser.parse_args()

    # --file is optional, so filename may be None; use '' for the extension
    # checks so they never operate on None.
    filename = args.filename or ''
    if args.csv:
        style = 'csv'
    elif args.json:
        style = 'json'
    elif filename.endswith('.csv'):
        style = 'csv'
    elif filename.endswith('.json'):
        style = 'json'
    else:
        style = None

    ministries = scrape(URL)
    ministry_data = pprint_ministries(ministries, style)

    if args.filename:
        with open(args.filename, 'w') as min_file:
            min_file.write(ministry_data)
    else:
        print(ministry_data)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment