Skip to content

Instantly share code, notes, and snippets.

@ferblape
Created March 3, 2018 12:05
Show Gist options
  • Save ferblape/4d2af60c534f98e630f67f1201ee2955 to your computer and use it in GitHub Desktop.
Save ferblape/4d2af60c534f98e630f67f1201ee2955 to your computer and use it in GitHub Desktop.
Python scraper
import requests
from lxml import html
from pprint import pprint  # NOTE(review): unused in the visible code — possibly used elsewhere or a debugging leftover
import csv

# Base URL is the host of the page; project links from the listing are
# relative paths that get appended to this.
BASE_URL = 'https://investinginyourfuture.gov.mt'

# Projects are fetched from the paginated list via this AJAX endpoint;
# the page number is appended to the URL (e.g. ...loadProjects.ashx?page=3).
PAGINATION_URL = 'https://investinginyourfuture.gov.mt/ajax/loadProjects.ashx?page='
def get_text(html_node):
    """Return the text content of an lxml element, or None for a missing node.

    Convenience wrapper so callers can pass the result of ``doc.find(...)``
    directly without checking for a failed lookup first.
    """
    return html_node.text if html_node is not None else None
# Every field on a project page lives in an element whose id shares this prefix.
_ID_PREFIX = 'mainPlaceHolder_coreContentPlaceHolder_mainContentPlaceHolder_'

# (tag, id suffix, trailing xpath) for each field, in CSV column order.
# The order MUST match the `headers` list written by scrape().
_PROJECT_FIELDS = [
    ('span', 'projectRefCode', ''),                                     # Code
    ('span', 'projectTitle', ''),                                       # Title
    ('span', 'projectCostBeneficiaryItem_divCostValue', ''),            # Project Cost
    ('span', 'projectCostBeneficiaryItem_divBeneficiaryValue', ''),     # Beneficiary
    ('td', 'projectDetails_tdLineMinistry', ''),                        # Line Ministry
    ('td', 'projectDetails_tdStartDate', ''),                           # Start Date
    ('td', 'projectDetails_tdEndDate', ''),                             # End Date
    ('div', 'projectDetails_divNonTechnicalShortSummaryContent', '/p'), # Non Technical Short Summary Of Project
    ('td', 'projectDetails_tdOperationalProgramme', ''),                # Operational Programme
    ('td', 'projectDetails_tdFund', ''),                                # Fund
    ('td', 'projectDetails_tdOperationalObjective', ''),                # Operational Objective
    ('td', 'projectDetails_tdPriorityAxis', ''),                        # Priority Axis
    ('td', 'projectDetails_tdFocusAreaOfIntervention1', ''),            # Focus Area Of Intervention
    ('div', 'projectDetails_divProjectObjectives', '/p'),               # Project Objectives
    ('div', 'projectDetails_divProjectResults', '/p'),                  # Project Results
    ('div', 'projectDetails_divProjectPurpose', '/p'),                  # Project Purpose
]


def scrape_project(url):
    """Fetch one project detail page and return its fields as a list.

    url: project path relative to BASE_URL, as found in the listing links.
    Returns a list of 16 values in the same order as the CSV headers in
    scrape(); a field whose element is missing (or empty) yields None.
    """
    res = requests.get(BASE_URL + url, timeout=30)  # timeout so a stalled server can't hang the crawl
    doc = html.fromstring(res.content)
    return [
        get_text(doc.find('.//{0}[@id="{1}{2}"]{3}'.format(tag, _ID_PREFIX, ident, tail)))
        for tag, ident, tail in _PROJECT_FIELDS
    ]
def scrape():
    """Crawl every page of the project listing and write all projects to data.csv.

    Walks the AJAX pagination endpoint starting at page 1, scrapes each
    project detail page, and writes one CSV row per project (plus a header
    row). Stops when the endpoint returns an empty page.
    """
    headers = ['Code', 'Title', 'Project Cost', 'Beneficiary', 'Line Ministry',
               'Start Date', 'End Date', 'Non Technical Short Summary Of Project',
               'Operational Programme', 'Fund', 'Operational Objective',
               'Priority Axis', 'Focus Area Of Intervention',
               'Project Objectives', 'Project Results', 'Project Purpose']
    with open('data.csv', 'w', newline='') as f:
        writer = csv.writer(f, quoting=csv.QUOTE_NONNUMERIC)
        writer.writerow(headers)
        # BUG FIX: previously started at 2 (a debugging leftover, alongside a
        # commented-out "page = 28"), silently skipping the first page of results.
        page = 1
        while True:
            res = requests.get(PAGINATION_URL + str(page), timeout=30)
            # The endpoint signals "no more pages" with an empty body.
            if res.headers.get('Content-Length') == '0':
                print("Exiting...")
                break
            doc = html.fromstring(res.content)
            links = doc.findall('.//div[@class="project-listing-item-title"]/a')
            if not links:
                # Defensive stop: if the server omits Content-Length (e.g.
                # chunked transfer), an empty page would otherwise loop forever.
                print("Exiting...")
                break
            for link in links:
                writer.writerow(scrape_project(link.get('href')))
            page += 1
            print("\n")
# Run the full crawl when executed as a script (no-op on import).
if __name__ == '__main__':
    scrape()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment