Code to scrape gov.uk articles that ministers are tagged in
# %%
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
    Purpose
        Scrape articles in which ministers are tagged and save them to SQL
    Inputs
        - SQL: temp.minister_govukid
        - Web: gov.uk pages
    Outputs
        - Pickle: data/govukarticles_20150101_20211231.pkl
        - SQL: reference.ukgovt.govukarticles_20150101_20211231
    Parameters
        - start_govuk_string: gov.uk string to start at (if None, scrapes entire list of ministers)  # noqa: E501
        - article_date_min: Minimum date for scraped articles (cannot be None)  # noqa: E501
        - article_date_max: Maximum date for scraped articles (if None, scrapes until current date)  # noqa: E501
    Notes
        None
'''
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os
import sqlalchemy
from sqlalchemy import create_engine
import urllib
from datetime import datetime
import time
# %%
# CONNECT TO D/B
driver = '{ODBC Driver 17 for SQL Server}'
server = os.environ['odbc_server']
database = os.environ['odbc_database']
authentication = 'ActiveDirectoryInteractive'    # Azure Active Directory - Universal with MFA support # noqa: E501
username = os.environ['odbc_username']

conn = urllib.parse.quote_plus(
    'DRIVER=' + driver +
    ';SERVER=' + server +
    ';DATABASE=' + database +
    ';UID=' + username +
    ';AUTHENTICATION=' + authentication + ';'
)

engine = create_engine('mssql+pyodbc:///?odbc_connect={}'.format(conn))
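
# An optional smoke test of the connection (not part of the original script;
# left commented out because the Azure AD interactive login prompts for MFA):
# pd.read_sql_query('select 1 as ok', con=engine)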
# %%
# READ IN LIST OF GOV.UK IDENTIFIERS
# NB: This is a distinct select, and needs to include only id and govuk_string, not minister_name, so that we only scrape details once for people like Zac Goldsmith, who feature under two names # noqa: E501
df_govuk_person = pd.read_sql_query(
    'select distinct' + ' '    # NB: See note above
        ' id,' + ' '
        ' govuk_string' + ' '
    'from temp.minister_govukid' + ' '
    'where' + ' '
        ' govuk_string is not null',    # Exclude ministers for whom we don't have a gov.uk string # noqa: E501
    con=engine,
    index_col='id'
)
# %%
# DEFINE FUNCTIONS FOR SCRAPING
def configure_scraper(govuk_string, article_date_min, article_date_max):
    page_number = 1
    scrape_news_and_communications_page(
        govuk_string, page_number, article_date_min, article_date_max
    )
    return
def scrape_news_and_communications_page(
    govuk_string, page_number, article_date_min, article_date_max
):
    target_url = (
        target_url_stub +
        target_url_newscommssnippet +
        target_url_pagesnippet +
        str(page_number) +
        target_url_peoplesnippet +
        govuk_string +
        target_url_startdatesnippet +
        article_date_min +
        target_url_enddatesnippet +
        article_date_max
    )

    r = requests.get(target_url, headers={'User-agent': 'Mozilla/5.0'})    # Gov.uk might require headers on the request (unconfirmed) # noqa: E501

    if r.status_code == 200:
        soup = BeautifulSoup(r.content, features='html.parser')

        results_list = soup.find('div', 'finder-results')

        if results_list is not None:    # This will not be the case for anyone who doesn't have any articles # noqa: E501
            if results_list.ul is not None:
                for result in results_list.ul.find_all('li', recursive=False):    # NB: Looking for direct descendants only # noqa: E501
                    article_partial_url = result.find('a').get('href')
                    time.sleep(1)
                    scrape_article_page(govuk_string, article_partial_url)

        if (
            soup.find('nav', 'gem-c-pagination')    # Pagination nav element exists # noqa: E501
        ):
            page_number += 1
            time.sleep(5)
            scrape_news_and_communications_page(
                govuk_string,
                page_number,
                article_date_min,
                article_date_max
            )

    else:
        status_code_row = {
            'govuk_string': govuk_string,
            'url': target_url,    # Full URL - URL stub plus partial URL
            'status_code': r.status_code
        }
        status_codes_list.append(status_code_row)

    return
def scrape_article_page(govuk_string, article_partial_url):
    target_url = target_url_stub + article_partial_url

    r = requests.get(target_url, headers={'User-agent': 'Mozilla/5.0'})    # Gov.uk might require headers on the request (unconfirmed) # noqa: E501

    if r.status_code == 200:
        soup = BeautifulSoup(r.content, features='html.parser')

        # Grab article type and title from the title section
        article_type = None    # Ensure these are defined even if the title section lacks the relevant elements # noqa: E501
        title = None
        title_section = soup.find('div', 'gem-c-title')
        if title_section.span is not None:
            article_type = title_section.find('span').get_text().strip()
        if title_section.h1 is not None:
            title = title_section.find('h1').get_text().strip()

        # Grab the tagged organisations and people and the publication date from the metadata section # noqa: E501
        metadata_section = soup.find('div', 'gem-c-metadata')

        if metadata_section is not None:
            entities = metadata_section.find_all('a', 'govuk-link')    # These will be both organisations and people. Class name is needed so that we don't pick up 'last updated' links # noqa: E501

            departments = []
            people = []

            for entity in entities:
                if 'organisations' in entity.get('href'):    # Link is to an organisation page on gov.uk # noqa: E501
                    departments.append(
                        entity.get('href')    # Grab the gov.uk URL version of the department's name, rather than the text string # noqa: E501
                        .replace('/government/organisations/', '')
                    )
                else:
                    people.append(
                        entity.get('href').replace('/government/people/', '')    # Grab the gov.uk URL version of someone's name, rather than the text string # noqa: E501
                    )

            published_line = metadata_section.find(
                'dt',
                'gem-c-metadata__term',
                string='Published'
            )
            published_date = published_line.find_next('dd').get_text().strip()    # There isn't a better way of uniquely selecting this line, other than going via its sibling # noqa: E501
            published_date = datetime.strftime(    # Put date into standard format # noqa: E501
                datetime.strptime(
                    published_date,
                    '%d %B %Y'
                ),
                '%Y-%m-%d'
            )

            # For speeches, replace the published date with the 'delivered on' date, if it exists (see e.g. https://www.gov.uk/government/speeches/the-importance-of-a-knowledge-rich-curriculum) # noqa: E501
            important_section = soup.find('div', 'app-c-important-metadata')

            if important_section is not None:
                delivered_line = important_section.find(    # NB: The result needs to be kept and used below, so that we read the 'delivered on' date rather than re-reading the published date # noqa: E501
                    'dt',
                    'app-c-important-metadata__term',
                    string='Delivered on: '
                )
                if delivered_line is not None:
                    published_date = (
                        delivered_line
                        .find_next('dd')
                        .get_text()
                    )

                    # Remove explanatory notes (Transcript of the speech, exactly as it was delivered; Original script, may differ from delivered version; Speaker's notes, may differ from delivered version; possibly others) # noqa: E501
                    # These appear in brackets, after the date followed by a space # noqa: E501
                    pos = published_date.find('(')
                    if pos != -1:    # If bracket is found
                        published_date = published_date[:pos - 1]

                    published_date = datetime.strftime(    # Put date into standard format # noqa: E501
                        datetime.strptime(
                            published_date,
                            '%d %B %Y'
                        ),
                        '%Y-%m-%d'
                    )

            article_row = {
                'govuk_string': govuk_string,
                'url': target_url,    # Full article URL - URL stub plus article partial URL # noqa: E501
                'article_type': article_type,
                'title': title,
                'date': published_date,
                'departments': departments,
                'people': people
            }
            rows_list.append(article_row)

    else:
        status_code_row = {
            'govuk_string': govuk_string,
            'url': target_url,    # Full article URL - URL stub plus article partial URL # noqa: E501
            'status_code': r.status_code
        }
        status_codes_list.append(status_code_row)

    return
# %%
# SCRAPE DATA AND SAVE TO DATAFRAME
# Set variables
target_url_stub = 'https://www.gov.uk'
target_url_newscommssnippet = '/search/news-and-communications?'
target_url_pagesnippet = 'page='
target_url_peoplesnippet = '&people='
target_url_startdatesnippet = '&public_timestamp%5Bfrom%5D='    # Includes [ and ] percent-encoded # noqa: E501
target_url_enddatesnippet = '&public_timestamp%5Bto%5D='    # Ditto

# Set min and max dates to use in pulling gov.uk article search results. article_date_min cannot be None. If article_date_max is None, scrapes until current date # noqa: E501
article_date_min = '2015-01-01'
article_date_max = '2021-12-31'
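
# With the snippets and dates above, the first results page URL for a person
# looks like this (illustrative person slug):
# https://www.gov.uk/search/news-and-communications?page=1&people=boris-johnson&public_timestamp%5Bfrom%5D=2015-01-01&public_timestamp%5Bto%5D=2021-12-31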
# Allow an optional starting value to be selected. If start_govuk_string is None, the entirety of df_govuk_person will be used as the input # noqa: E501
start_govuk_string = None

if start_govuk_string:
    idx = np.where(
        df_govuk_person['govuk_string'] == start_govuk_string    # NB: df_govuk_person holds only id and govuk_string, not minister_name, so we match on govuk_string # noqa: E501
    )[0][0]
    df_govuk_person_to_scrape = df_govuk_person.iloc[idx:]
else:
    df_govuk_person_to_scrape = df_govuk_person
# Set up data structures used to store results
# This saves rows to a list of dictionaries, then saves that list of dictionaries to a dataframe # noqa: E501
# This has performance benefits over adding individual rows to a dataframe (ref: https://stackoverflow.com/questions/10715965/create-a-pandas-dataframe-by-appending-one-row-at-a-time) # noqa: E501
# NB: Only non-200 status codes are saved
rows_list = []
status_codes_list = []

# Run scraper
# NB: This could be made more efficient by doing the scraping in two stages: gathering all of the article URLs to be hit and removing duplicates, then hitting those URLs. As things stand, we will hit some URLs more than once, where an article appears in the news and communications search results for more than one person. Duplicate rows are only avoided because the data we save to SQL keeps the govuk_string that led us to scrape an article, rather than the people tagged in it. See the sketch at the end of this cell # noqa: E501
result = [
    configure_scraper(x, article_date_min, article_date_max)
    for x in df_govuk_person_to_scrape['govuk_string']
]

# Save results to dataframe, and correct data type of one column
df_govkukarticles = pd.DataFrame(rows_list)
df_govkukarticles['date'] = pd.to_datetime(df_govkukarticles['date'])

df_status_codes = pd.DataFrame(status_codes_list)
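
# A minimal sketch of the two-stage approach described in the NB above,
# defined but never called. collect_article_urls() is a hypothetical helper
# that would walk a person's search results pages and return the partial
# article URLs without scraping the articles themselves
def scrape_in_two_stages(govuk_strings, article_date_min, article_date_max):
    unique_partial_urls = set()
    for govuk_string in govuk_strings:    # Stage one: gather and dedupe URLs
        unique_partial_urls.update(
            collect_article_urls(    # Hypothetical helper - not defined here # noqa: F821,E501
                govuk_string, article_date_min, article_date_max
            )
        )
    for article_partial_url in sorted(unique_partial_urls):    # Stage two: hit each unique article once # noqa: E501
        time.sleep(1)
        scrape_article_page(None, article_partial_url)    # In this mode the people tagged in the article, rather than a govuk_string, would identify who it relates to # noqa: E501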
# %%
# SET DF DISPLAY OPTIONS
pd.options.display.max_rows = None

# %%
# PRINT STATUS CODES FOR MANUAL QA
df_status_codes

# %%
# PRINT RESULTS FOR MANUAL QA
df_govkukarticles
# %%
# SAVE SCRAPED DATA TO PICKLE
df_govkukarticles.to_pickle('data/govukarticles_20150101_20211231.pkl')
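
# The pickle can be read back in a later session with, e.g.:
# df_govkukarticles = pd.read_pickle('data/govukarticles_20150101_20211231.pkl')  # noqa: E501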
# %%
# TURN GOV.UK ARTICLES DATA INTO A TIDY DATASET
# Explode departments column, which contains a list, into separate rows and rename column # noqa: E501
df_govkukarticles_tidy = (
    df_govkukarticles.explode('departments')
    .reset_index(drop=True)    # Without this, newly created rows are given duplicate indexes # noqa: E501
)
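# e.g. an article row with departments=['department-for-education', 'hm-treasury'] becomes two rows, one per department (illustrative values) # noqa: E501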
df_govkukarticles_tidy.rename(
    columns={
        'departments': 'department'
    },
    inplace=True
)

# Drop people column
df_govkukarticles_tidy = df_govkukarticles_tidy.drop('people', axis=1)
# %%
# SAVE RESULTS TO SQL
pd.io.sql.execute(
    'drop table if exists reference.[ukgovt.govukarticles_20150101_20211231]',
    con=engine
)
df_govkukarticles_tidy.to_sql(
    'ukgovt.govukarticles_20150101_20211231',
    schema='reference',
    con=engine,
    dtype={
        'govuk_string': sqlalchemy.types.NVARCHAR(length=255),
        'url': sqlalchemy.types.NVARCHAR(length=255),
        'article_type': sqlalchemy.types.NVARCHAR(length=255),
        'title': sqlalchemy.types.NVARCHAR(length=255),
        'date': sqlalchemy.types.DATE,
        'department': sqlalchemy.types.NVARCHAR(length=255)
    },
    index=False
)