Code to scrape gov.uk articles that ministers are tagged in
# %%
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
    Purpose
        Scrape articles in which ministers are tagged and save them to SQL
    Inputs
        - SQL: temp.minister_govukid
        - Web: gov.uk pages
    Outputs
        - Pickle: data/govukarticles_20150101_20211231.pkl
        - SQL: reference.ukgovt.govukarticles_20150101_20211231
    Parameters
        - start_govuk_string: gov.uk string to start at (if None, scrapes entire list of ministers)  # noqa: E501
        - article_date_min: Minimum date for scraped articles (cannot be None)  # noqa: E501
        - article_date_max: Maximum date for scraped articles (if None, scrapes until current date)  # noqa: E501
    Notes
        None
'''
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os
import sqlalchemy
from sqlalchemy import create_engine
import urllib
from datetime import datetime
import time
# %%
# CONNECT TO D/B
driver = '{ODBC Driver 17 for SQL Server}'
server = os.environ['odbc_server']
database = os.environ['odbc_database']
authentication = 'ActiveDirectoryInteractive'    # Azure Active Directory - Universal with MFA support # noqa: E501
username = os.environ['odbc_username']

conn = urllib.parse.quote_plus(
    'DRIVER=' + driver +
    ';SERVER=' + server +
    ';DATABASE=' + database +
    ';UID=' + username +
    ';AUTHENTICATION=' + authentication + ';'
)

engine = create_engine('mssql+pyodbc:///?odbc_connect={}'.format(conn))
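
# An optional smoke test of the connection (not part of the original script;
# left commented out because the Azure AD interactive login prompts for MFA):
# pd.read_sql_query('select 1 as ok', con=engine)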
# %%
# READ IN LIST OF GOV.UK IDENTIFIERS
# NB: This is a distinct select, and needs to include only id and govuk_string, not minister_name, so that we only scrape details once for people like Zac Goldsmith, who feature under two names # noqa: E501
df_govuk_person = pd.read_sql_query(
    'select distinct' + ' '    # NB: See note above
        ' id,' + ' '
        ' govuk_string' + ' '
    'from temp.minister_govukid' + ' '
    'where' + ' '
        ' govuk_string is not null',    # Exclude ministers for whom we don't have a gov.uk string # noqa: E501
    con=engine,
    index_col='id'
)
# %%
# DEFINE FUNCTIONS FOR SCRAPING
def configure_scraper(govuk_string, article_date_min, article_date_max):
    page_number = 1
    scrape_news_and_communications_page(
        govuk_string, page_number, article_date_min, article_date_max
    )
    return
def scrape_news_and_communications_page(
    govuk_string, page_number, article_date_min, article_date_max
):
    target_url = (
        target_url_stub +
        target_url_newscommssnippet +
        target_url_pagesnippet +
        str(page_number) +
        target_url_peoplesnippet +
        govuk_string +
        target_url_startdatesnippet +
        article_date_min +
        target_url_enddatesnippet +
        article_date_max
    )

    r = requests.get(target_url, headers={'User-agent': 'Mozilla/5.0'})    # Gov.uk might require headers on the request (unconfirmed) # noqa: E501

    if r.status_code == 200:
        soup = BeautifulSoup(r.content, features='html.parser')

        results_list = soup.find('div', 'finder-results')

        if results_list is not None:    # This will not be the case for anyone who doesn't have any articles # noqa: E501
            if results_list.ul is not None:
                for result in results_list.ul.find_all('li', recursive=False):    # NB: Looking for direct descendants only # noqa: E501
                    article_partial_url = result.find('a').get('href')
                    time.sleep(1)
                    scrape_article_page(govuk_string, article_partial_url)

        if (
            soup.find('nav', 'gem-c-pagination')    # Pagination nav element exists # noqa: E501
        ):
            page_number += 1
            time.sleep(5)
            scrape_news_and_communications_page(
                govuk_string,
                page_number,
                article_date_min,
                article_date_max
            )

    else:
        status_code_row = {
            'govuk_string': govuk_string,
            'url': target_url,    # Full URL - URL stub plus partial URL
            'status_code': r.status_code
        }
        status_codes_list.append(status_code_row)

    return
def scrape_article_page(govuk_string, article_partial_url):
    target_url = target_url_stub + article_partial_url

    r = requests.get(target_url, headers={'User-agent': 'Mozilla/5.0'})    # Gov.uk might require headers on the request (unconfirmed) # noqa: E501

    if r.status_code == 200:
        soup = BeautifulSoup(r.content, features='html.parser')

        # Grab article type and title from the title section
        article_type = None    # Ensure these are defined even if the title section lacks the relevant elements # noqa: E501
        title = None
        title_section = soup.find('div', 'gem-c-title')
        if title_section.span is not None:
            article_type = title_section.find('span').get_text().strip()
        if title_section.h1 is not None:
            title = title_section.find('h1').get_text().strip()

        # Grab the tagged organisations and people and the publication date from the metadata section # noqa: E501
        metadata_section = soup.find('div', 'gem-c-metadata')

        if metadata_section is not None:
            entities = metadata_section.find_all('a', 'govuk-link')    # These will be both organisations and people. Class name is needed so that we don't pick up 'last updated' links # noqa: E501

            departments = []
            people = []

            for entity in entities:
                if 'organisations' in entity.get('href'):    # Link is to an organisation page on gov.uk # noqa: E501
                    departments.append(
                        entity.get('href')    # Grab the gov.uk URL version of the department's name, rather than the text string # noqa: E501
                        .replace('/government/organisations/', '')
                    )
                else:
                    people.append(
                        entity.get('href').replace('/government/people/', '')    # Grab the gov.uk URL version of someone's name, rather than the text string # noqa: E501
                    )

            published_line = metadata_section.find(
                'dt',
                'gem-c-metadata__term',
                string='Published'
            )
            published_date = published_line.find_next('dd').get_text().strip()    # There isn't a better way of uniquely selecting this line, other than going via its sibling # noqa: E501
            published_date = datetime.strftime(    # Put date into standard format # noqa: E501
                datetime.strptime(
                    published_date,
                    '%d %B %Y'
                ),
                '%Y-%m-%d'
            )

            # For speeches, replace the published date with the 'delivered on' date, if it exists (see e.g. https://www.gov.uk/government/speeches/the-importance-of-a-knowledge-rich-curriculum) # noqa: E501
            important_section = soup.find('div', 'app-c-important-metadata')

            if important_section is not None:
                delivered_line = important_section.find(    # NB: The result needs to be kept and used below, so that we read the 'delivered on' date rather than re-reading the published date # noqa: E501
                    'dt',
                    'app-c-important-metadata__term',
                    string='Delivered on: '
                )
                if delivered_line is not None:
                    published_date = (
                        delivered_line
                        .find_next('dd')
                        .get_text()
                    )

                    # Remove explanatory notes (Transcript of the speech, exactly as it was delivered; Original script, may differ from delivered version; Speaker's notes, may differ from delivered version; possibly others) # noqa: E501
                    # These appear in brackets, after the date followed by a space # noqa: E501
                    pos = published_date.find('(')
                    if pos != -1:    # If bracket is found
                        published_date = published_date[:pos - 1]

                    published_date = datetime.strftime(    # Put date into standard format # noqa: E501
                        datetime.strptime(
                            published_date,
                            '%d %B %Y'
                        ),
                        '%Y-%m-%d'
                    )

            article_row = {
                'govuk_string': govuk_string,
                'url': target_url,    # Full article URL - URL stub plus article partial URL # noqa: E501
                'article_type': article_type,
                'title': title,
                'date': published_date,
                'departments': departments,
                'people': people
            }
            rows_list.append(article_row)

    else:
        status_code_row = {
            'govuk_string': govuk_string,
            'url': target_url,    # Full article URL - URL stub plus article partial URL # noqa: E501
            'status_code': r.status_code
        }
        status_codes_list.append(status_code_row)

    return
# %%
# SCRAPE DATA AND SAVE TO DATAFRAME
# Set variables
target_url_stub = 'https://www.gov.uk'
target_url_newscommssnippet = '/search/news-and-communications?'
target_url_pagesnippet = 'page='
target_url_peoplesnippet = '&people='
target_url_startdatesnippet = '&public_timestamp%5Bfrom%5D='    # Includes [ and ] percent-encoded # noqa: E501
target_url_enddatesnippet = '&public_timestamp%5Bto%5D='    # Ditto

# Set min and max dates to use in pulling gov.uk article search results. article_date_min cannot be None. If article_date_max is None, scrapes until current date # noqa: E501
article_date_min = '2015-01-01'
article_date_max = '2021-12-31'
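
# With the snippets and dates above, the first results page URL for a person
# looks like this (illustrative person slug):
# https://www.gov.uk/search/news-and-communications?page=1&people=boris-johnson&public_timestamp%5Bfrom%5D=2015-01-01&public_timestamp%5Bto%5D=2021-12-31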
# Allow an optional starting value to be selected. If start_govuk_string is None, the entirety of df_govuk_person will be used as the input # noqa: E501
start_govuk_string = None

if start_govuk_string:
    idx = np.where(
        df_govuk_person['govuk_string'] == start_govuk_string    # NB: df_govuk_person holds only id and govuk_string, not minister_name, so we match on govuk_string # noqa: E501
    )[0][0]
    df_govuk_person_to_scrape = df_govuk_person.iloc[idx:]
else:
    df_govuk_person_to_scrape = df_govuk_person
# Set up data structures used to store results
# This saves rows to a list of dictionaries, then saves that list of dictionaries to a dataframe # noqa: E501
# This has performance benefits over adding individual rows to a dataframe (ref: https://stackoverflow.com/questions/10715965/create-a-pandas-dataframe-by-appending-one-row-at-a-time) # noqa: E501
# NB: Only non-200 status codes are saved
rows_list = []
status_codes_list = []

# Run scraper
# NB: This could be made more efficient by doing the scraping in two stages: gathering all of the article URLs to be hit and removing duplicates, then hitting those URLs. As things stand, we will hit some URLs more than once, where an article appears in the news and communications search results for more than one person. Duplicate rows are only avoided because the data we save to SQL keeps the govuk_string that led us to scrape an article, rather than the people tagged in it. See the sketch at the end of this cell # noqa: E501
result = [
    configure_scraper(x, article_date_min, article_date_max)
    for x in df_govuk_person_to_scrape['govuk_string']
]

# Save results to dataframe, and correct data type of one column
df_govkukarticles = pd.DataFrame(rows_list)
df_govkukarticles['date'] = pd.to_datetime(df_govkukarticles['date'])

df_status_codes = pd.DataFrame(status_codes_list)
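
# A minimal sketch of the two-stage approach described in the NB above,
# defined but never called. collect_article_urls() is a hypothetical helper
# that would walk a person's search results pages and return the partial
# article URLs without scraping the articles themselves
def scrape_in_two_stages(govuk_strings, article_date_min, article_date_max):
    unique_partial_urls = set()
    for govuk_string in govuk_strings:    # Stage one: gather and dedupe URLs
        unique_partial_urls.update(
            collect_article_urls(    # Hypothetical helper - not defined here # noqa: F821,E501
                govuk_string, article_date_min, article_date_max
            )
        )
    for article_partial_url in sorted(unique_partial_urls):    # Stage two: hit each unique article once # noqa: E501
        time.sleep(1)
        scrape_article_page(None, article_partial_url)    # In this mode the people tagged in the article, rather than a govuk_string, would identify who it relates to # noqa: E501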
# %%
# SET DF DISPLAY OPTIONS
pd.options.display.max_rows = None

# %%
# PRINT STATUS CODES FOR MANUAL QA
df_status_codes

# %%
# PRINT RESULTS FOR MANUAL QA
df_govkukarticles
# %%
# SAVE SCRAPED DATA TO PICKLE
df_govkukarticles.to_pickle('data/govukarticles_20150101_20211231.pkl')
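
# The pickle can be read back in a later session with, e.g.:
# df_govkukarticles = pd.read_pickle('data/govukarticles_20150101_20211231.pkl')  # noqa: E501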
# %%
# TURN GOV.UK ARTICLES DATA INTO A TIDY DATASET
# Explode departments column, which contains a list, into separate rows and rename column # noqa: E501
df_govkukarticles_tidy = (
    df_govkukarticles.explode('departments')
    .reset_index(drop=True)    # Without this, newly created rows are given duplicate indexes # noqa: E501
)
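# e.g. an article row with departments=['department-for-education', 'hm-treasury'] becomes two rows, one per department (illustrative values) # noqa: E501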
df_govkukarticles_tidy.rename(
    columns={
        'departments': 'department'
    },
    inplace=True
)

# Drop people column
df_govkukarticles_tidy = df_govkukarticles_tidy.drop('people', axis=1)
# %%
# SAVE RESULTS TO SQL
pd.io.sql.execute(
    'drop table if exists reference.[ukgovt.govukarticles_20150101_20211231]',
    con=engine
)
df_govkukarticles_tidy.to_sql(
    'ukgovt.govukarticles_20150101_20211231',
    schema='reference',
    con=engine,
    dtype={
        'govuk_string': sqlalchemy.types.NVARCHAR(length=255),
        'url': sqlalchemy.types.NVARCHAR(length=255),
        'article_type': sqlalchemy.types.NVARCHAR(length=255),
        'title': sqlalchemy.types.NVARCHAR(length=255),
        'date': sqlalchemy.types.DATE,
        'department': sqlalchemy.types.NVARCHAR(length=255)
    },
    index=False
)