Code to scrape gov.uk articles that ministers are tagged in
# %%
# #!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Purpose
Scrape articles in which ministers are tagged and save them to SQL
Inputs
- SQL: temp.minister_govukid
- Web: gov.uk pages
Outputs
- Pickle: data/govukarticles_20150101_20211231.pkl
- SQL: reference.ukgovt.govukarticles_20150101_20211231
Parameters
- start_govuk_string: gov.uk string to start at (if None, scrapes entire list of ministers) # noqa: E501
- article_date_min: Minimum date for scraped articles (cannot be None) # noqa: E501
- article_date_max: Maximum date for scraped articles (if None, scrapes until current date) # noqa: E501
Notes
None
'''
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os
import sqlalchemy
from sqlalchemy import create_engine
import urllib
from datetime import datetime
import time
# %%
# CONNECT TO D/B
driver = '{ODBC Driver 17 for SQL Server}'
server = os.environ['odbc_server']
database = os.environ['odbc_database']
authentication = 'ActiveDirectoryInteractive' # Azure Active Directory - Universal with MFA support # noqa: E501
username = os.environ['odbc_username']
conn = urllib.parse.quote_plus(
    'DRIVER=' + driver +
    ';SERVER=' + server +
    ';DATABASE=' + database +
    ';UID=' + username +
    ';AUTHENTICATION=' + authentication + ';'
)
engine = create_engine('mssql+pyodbc:///?odbc_connect={}'.format(conn))
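# The following is an optional, minimal connectivity check - an illustrative
# addition, not part of the original script. It assumes the environment
# variables above are set and the interactive Azure AD login succeeds.
# Uncomment to run:
# with engine.connect() as connection:
#     print(pd.read_sql_query('select 1 as ok', con=connection))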
# %%
# READ IN LIST OF GOV.UK IDENTIFIERS
# NB: This needs to be a distinct select, including only id and govuk_string (not minister_name), so that we only scrape details once for people like Zac Goldsmith, who feature under two names # noqa: E501
df_govuk_person = pd.read_sql_query(
    'select distinct' + ' '  # NB: See note above
    ' id,' + ' '
    ' govuk_string' + ' '
    'from temp.minister_govukid' + ' '
    'where' + ' '
    ' govuk_string is not null',  # Exclude ministers for whom we don't have a gov.uk string # noqa: E501
    con=engine,
    index_col='id'
)
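# Optional sanity check on the result - an illustrative addition, not part of
# the original script. df_govuk_person should be indexed by id with a single
# govuk_string column holding gov.uk URL slugs. Uncomment to run:
# print(df_govuk_person.head())
# assert list(df_govuk_person.columns) == ['govuk_string']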
# %%
# DEFINE FUNCTIONS FOR SCRAPING
def configure_scraper(govuk_string, article_date_min, article_date_max):
    page_number = 1
    scrape_news_and_communications_page(
        govuk_string, page_number, article_date_min, article_date_max
    )
    return
def scrape_news_and_communications_page(
    govuk_string, page_number, article_date_min, article_date_max
):
    target_url = (
        target_url_stub +
        target_url_newscommssnippet +
        target_url_pagesnippet +
        str(page_number) +
        target_url_peoplesnippet +
        govuk_string +
        target_url_startdatesnippet +
        article_date_min +
        target_url_enddatesnippet +
        article_date_max
    )
    r = requests.get(target_url, headers={'User-agent': 'Mozilla/5.0'})  # Gov.uk might require headers on the request (unconfirmed) # noqa: E501
    if r.status_code == 200:
        soup = BeautifulSoup(r.content, features='html.parser')
        results_list = soup.find('div', 'finder-results')
        if results_list is not None:  # This will not be the case for anyone who doesn't have any articles # noqa: E501
            if results_list.ul is not None:
                for result in results_list.ul.find_all('li', recursive=False):  # NB: Looking for direct descendants only # noqa: E501
                    article_partial_url = result.find('a').get('href')
                    time.sleep(1)
                    scrape_article_page(govuk_string, article_partial_url)
        if (
            soup.find('nav', 'gem-c-pagination')  # Pagination nav element exists # noqa: E501
        ):
            page_number += 1
            time.sleep(5)
            scrape_news_and_communications_page(
                govuk_string,
                page_number,
                article_date_min,
                article_date_max
            )
    else:
        row = {
            'govuk_string': govuk_string,
            'url': target_url,  # Full URL - URL stub plus partial URL # noqa: E501
            'status_code': r.status_code
        }
        status_codes_list.append(row)
    return
def scrape_article_page(govuk_string, article_partial_url):
    target_url = target_url_stub + article_partial_url
    r = requests.get(target_url, headers={'User-agent': 'Mozilla/5.0'})  # Gov.uk might require headers on the request (unconfirmed) # noqa: E501
    if r.status_code == 200:
        soup = BeautifulSoup(r.content, features='html.parser')
        # Grab article type and title from the title section
        title_section = soup.find('div', 'gem-c-title')
        article_type = None
        title = None
        if title_section.span is not None:
            article_type = title_section.find('span').get_text().strip()
        if title_section.h1 is not None:
            title = title_section.find('h1').get_text().strip()
        # Grab the tagged organisations and people and the publication date from the metadata section # noqa: E501
        metadata_section = soup.find('div', 'gem-c-metadata')
        if metadata_section is not None:
            entities = metadata_section.find_all('a', 'govuk-link')  # These will be both organisations and people. Class name is needed so that we don't pick up 'last updated' links # noqa: E501
            departments = []
            people = []
            for entity in entities:
                if 'organisations' in entity.get('href'):  # Link is to an organisation page on gov.uk # noqa: E501
                    departments.append(
                        entity.get('href')  # Grab the gov.uk URL version of the department's name, rather than the text string # noqa: E501
                        .replace('/government/organisations/', '')
                    )
                else:
                    people.append(
                        entity.get('href').replace('/government/people/', '')  # Grab the gov.uk URL version of someone's name, rather than the text string # noqa: E501
                    )
            published_line = metadata_section.find(
                'dt',
                'gem-c-metadata__term',
                string='Published'
            )
            published_date = published_line.findNext('dd').get_text().strip()  # There isn't a better way of uniquely selecting this line, other than going via its sibling # noqa: E501
            published_date = datetime.strftime(  # Put date into standard format # noqa: E501
                datetime.strptime(
                    published_date,
                    '%d %B %Y'
                ),
                '%Y-%m-%d'
            )
            # For speeches, replace the published date with the 'delivered on' date, if it exists (see e.g. https://www.gov.uk/government/speeches/the-importance-of-a-knowledge-rich-curriculum) # noqa: E501
            important_section = soup.find('div', 'app-c-important-metadata')
            if important_section is not None:
                delivered_line = important_section.find(
                    'dt',
                    'app-c-important-metadata__term',
                    string='Delivered on: '
                )
                if delivered_line is not None:
                    published_date = (
                        delivered_line
                        .findNext('dd')
                        .get_text()
                        .strip()
                    )
                    # Remove explanatory notes ('Transcript of the speech, exactly as it was delivered'; 'Original script, may differ from delivered version'; 'Speaker's notes, may differ from delivered version'; possibly others) # noqa: E501
                    # These appear in brackets, after the date followed by a space
                    pos = published_date.find('(')
                    if pos != -1:  # If bracket is found
                        published_date = published_date[:pos - 1]
                    published_date = datetime.strftime(  # Put date into standard format # noqa: E501
                        datetime.strptime(
                            published_date,
                            '%d %B %Y'
                        ),
                        '%Y-%m-%d'
                    )
            row = {
                'govuk_string': govuk_string,
                'url': target_url,  # Full article URL - URL stub plus article partial URL # noqa: E501
                'article_type': article_type,
                'title': title,
                'date': published_date,
                'departments': departments,
                'people': people
            }
            rows_list.append(row)
    else:
        row = {
            'govuk_string': govuk_string,
            'url': target_url,  # Full article URL - URL stub plus article partial URL # noqa: E501
            'status_code': r.status_code
        }
        status_codes_list.append(row)
    return
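# For illustration, one entry appended to rows_list has the following shape
# (all values here are hypothetical examples, not scraped data):
# {
#     'govuk_string': 'example-minister',
#     'url': 'https://www.gov.uk/government/news/example-article',
#     'article_type': 'Press release',
#     'title': 'Example title',
#     'date': '2021-06-01',
#     'departments': ['example-department'],
#     'people': ['example-minister']
# }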
# %%
# SCRAPE DATA AND SAVE TO DATAFRAME
# Set variables
target_url_stub = 'https://www.gov.uk'
target_url_newscommssnippet = '/search/news-and-communications?'
target_url_pagesnippet = 'page='
target_url_peoplesnippet = '&people='
target_url_startdatesnippet = '&public_timestamp%5Bfrom%5D=' # Includes [ and ] percent-encoded # noqa: E501
target_url_enddatesnippet = '&public_timestamp%5Bto%5D=' # Ditto
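# For illustration, scrape_news_and_communications_page() assembles the
# snippets above into a search URL of the following form (the person slug
# shown is hypothetical):
# https://www.gov.uk/search/news-and-communications?page=1&people=example-minister&public_timestamp%5Bfrom%5D=2015-01-01&public_timestamp%5Bto%5D=2021-12-31 # noqa: E501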
# Set min and max dates to use in pulling gov.uk article search results. article_date_min cannot be None. If article_date_max is None, scrapes until current date # noqa: E501
article_date_min = '2015-01-01'
article_date_max = '2021-12-31'
# Allow an optional starting value to be selected. If start_govuk_string is None, the entirety of df_govuk_person will be used as the input # noqa: E501
start_govuk_string = None
if start_govuk_string:
    idx = np.where(
        df_govuk_person['govuk_string'] == start_govuk_string
    )[0][0]
    df_govuk_person_to_scrape = df_govuk_person.iloc[idx:]
else:
    df_govuk_person_to_scrape = df_govuk_person
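# For example, setting start_govuk_string to a gov.uk slug such as
# 'example-minister' (hypothetical) above would resume the scrape from that
# person's position in df_govuk_person, skipping everyone before them.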
# Set up data structures used to store results
# This saves rows to a list of dictionaries then saves this list of dictionaries to a dataframe # noqa: E501
# This has performance benefits over adding individual rows to a dataframe (ref: https://stackoverflow.com/questions/10715965/create-a-pandas-dataframe-by-appending-one-row-at-a-time) # noqa: E501
# NB: Only non-200 status codes are saved
rows_list = []
status_codes_list = []
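# Minimal sketch of the pattern described above (names are illustrative):
# rows = []
# for item in items:
#     rows.append({'col_a': ..., 'col_b': ...})  # cheap list append per row
# df = pd.DataFrame(rows)  # single DataFrame construction at the end
# ...rather than appending rows to a DataFrame inside the loop.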
# Run scraper
# NB: This could be made more efficient by doing the scraping in two stages: gathering all of the article URLs to be hit and removing duplicates, then hitting those URLs. We will hit some URLs more than once as things stand, where an article appears in the news and communications search results for more than one person. Duplicates are only avoided because we keep the govuk_string that led us to scrape an article rather than the people tagged in the article in the data we save to SQL # noqa: E501
for govuk_string in df_govuk_person_to_scrape['govuk_string']:
    configure_scraper(govuk_string, article_date_min, article_date_max)
# Save results to dataframe, and correct data type of one column
df_govukarticles = pd.DataFrame(rows_list)
df_govukarticles['date'] = pd.to_datetime(df_govukarticles['date'])
df_status_codes = pd.DataFrame(status_codes_list)
# %%
# SET DF DISPLAY OPTIONS
pd.options.display.max_rows = None
# %%
# PRINT STATUS CODES FOR MANUAL QA
df_status_codes
# %%
# PRINT RESULTS FOR MANUAL QA
df_govukarticles
# %%
# SAVE SCRAPED DATA TO PICKLE
df_govukarticles.to_pickle('data/govukarticles_20150101_20211231.pkl')
# %%
# TURN GOV.UK ARTICLES DATA INTO A TIDY DATASET
# Explode departments column, which contains a list, into separate rows and rename column # noqa: E501
df_govukarticles_tidy = (
    df_govukarticles.explode('departments')
    .reset_index(drop=True)  # Without this, newly created rows are given duplicate indexes # noqa: E501
)
df_govukarticles_tidy.rename(
    columns={
        'departments': 'department'
    },
    inplace=True
)
# Drop people column
df_govukarticles_tidy = df_govukarticles_tidy.drop('people', axis=1)
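# For illustration, explode() above turns a single row with
# departments=['example-dept-a', 'example-dept-b'] (hypothetical values) into
# two rows, one per department, with all other columns repeated on each row.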
# %%
# SAVE RESULTS TO SQL
pd.io.sql.execute(
    'drop table if exists reference.[ukgovt.govukarticles_20150101_20211231]',
    con=engine
)
df_govukarticles_tidy.to_sql(
    'ukgovt.govukarticles_20150101_20211231',
    schema='reference',
    con=engine,
    dtype={
        'govuk_string': sqlalchemy.types.NVARCHAR(length=255),
        'url': sqlalchemy.types.NVARCHAR(length=255),
        'article_type': sqlalchemy.types.NVARCHAR(length=255),
        'title': sqlalchemy.types.NVARCHAR(length=255),
        'date': sqlalchemy.types.DATE,
        'department': sqlalchemy.types.NVARCHAR(length=255)
    },
    index=False
)
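# Optional read-back check - an illustrative addition, assuming the table was
# created successfully. Uncomment to run:
# pd.read_sql_query(
#     'select top 5 * from reference.[ukgovt.govukarticles_20150101_20211231]',
#     con=engine
# )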